From f8cdf65b51f036d77edece8a175447dd41be63ca Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Mon, 4 Mar 2024 11:22:03 +0800 Subject: bcachefs: Fix null-ptr-deref in bch2_fs_alloc() There is a null-ptr-deref issue reported by kasan: KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] Call Trace: bch2_fs_alloc+0x1092/0x2170 [bcachefs] bch2_fs_open+0x683/0xe10 [bcachefs] ... When initializing the name of bch_fs, it needs to dynamically alloc memory to meet the length of the name. However, when name allocation failed, it will cause a null-ptr-deref access exception in subsequent string copy. Fix this issue by checking if name allocation is successful. Fixes: 401ec4db6308 ("bcachefs: Printbuf rework") Signed-off-by: Li Zetao Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 6b23e11825e6..24fa41bbe7e3 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -818,13 +818,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto err; pr_uuid(&name, c->sb.user_uuid.b); - strscpy(c->name, name.buf, sizeof(c->name)); - printbuf_exit(&name); - ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0; if (ret) goto err; + strscpy(c->name, name.buf, sizeof(c->name)); + printbuf_exit(&name); + /* Compat: */ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) -- cgit v1.2.3 From 6fa30fe7f79592ae2ba6e44fa1e7589942c58abe Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Mar 2024 12:30:49 -0500 Subject: bcachefs: journal_seq_blacklist_add() now handles entries being added out of order bch2_journal_seq_blacklist_add() was bugged when the new entry overlapped with multiple existing entries, and it also assumed new entries are being added in increasing order. This is true on any sane filesystem, but when trying to recover from very badly mangled filesystems we might end up with the journal sequence number rewinding vs. what the blacklist list knows about - easiest to just handle that here. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_seq_blacklist.c | 66 ++++++++++++------------------------- fs/bcachefs/recovery.c | 2 +- 2 files changed, 22 insertions(+), 46 deletions(-) diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 0200e299cfbb..5acc1276675c 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -43,61 +43,36 @@ static unsigned sb_blacklist_u64s(unsigned nr) return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); } -static struct bch_sb_field_journal_seq_blacklist * -blacklist_entry_try_merge(struct bch_fs *c, - struct bch_sb_field_journal_seq_blacklist *bl, - unsigned i) -{ - unsigned nr = blacklist_nr_entries(bl); - - if (le64_to_cpu(bl->start[i].end) >= - le64_to_cpu(bl->start[i + 1].start)) { - bl->start[i].end = bl->start[i + 1].end; - --nr; - memmove(&bl->start[i], - &bl->start[i + 1], - sizeof(bl->start[0]) * (nr - i)); - - bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, - sb_blacklist_u64s(nr)); - BUG_ON(!bl); - } - - return bl; -} - -static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e, - u64 start, u64 end) -{ - return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start); -} - int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) { struct bch_sb_field_journal_seq_blacklist *bl; - unsigned i, nr; + unsigned i = 0, nr; int ret = 0; mutex_lock(&c->sb_lock); bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); nr = blacklist_nr_entries(bl); - for (i = 0; i < nr; i++) { + while (i < nr) { struct journal_seq_blacklist_entry *e = bl->start + i; - if (bl_entry_contig_or_overlaps(e, start, end)) { - e->start = cpu_to_le64(min(start, le64_to_cpu(e->start))); - e->end = cpu_to_le64(max(end, le64_to_cpu(e->end))); - - if (i + 1 < nr) - bl = blacklist_entry_try_merge(c, - bl, i); - if (i) - bl = blacklist_entry_try_merge(c, - bl, i - 1); - goto out_write_sb; + if (end < le64_to_cpu(e->start)) + break; + + if (start > le64_to_cpu(e->end)) { + i++; + continue; } + + /* + * Entry is contiguous or overlapping with new entry: merge it + * with new entry, and delete: + */ + + start = min(start, le64_to_cpu(e->start)); + end = max(end, le64_to_cpu(e->end)); + array_remove_item(bl->start, nr, i); } bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, @@ -107,9 +82,10 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) goto out; } - bl->start[nr].start = cpu_to_le64(start); - bl->start[nr].end = cpu_to_le64(end); -out_write_sb: + array_insert_item(bl->start, nr, i, ((struct journal_seq_blacklist_entry) { + .start = cpu_to_le64(start), + .end = cpu_to_le64(end), + })); c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); ret = bch2_write_super(c); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 21e13bb4335b..1aa21adc7ee5 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -950,7 +950,7 @@ use_clean: bch2_journal_seq_blacklist_add(c, blacklist_seq, journal_seq); if (ret) { - bch_err(c, "error creating new journal seq blacklist entry"); + bch_err_msg(c, ret, "error creating new journal seq blacklist entry"); goto err; } } -- cgit v1.2.3 From 88005d5dfbc9665b8cd510916baed9c89960ee90 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 8 Mar 2024 15:25:27 -0500 Subject: bcachefs: extent_entry_next_safe() We need to be able to iterate over extent ptrs that may be corrupted in order to print them - this fixes a bug where we'd pop an assert in bch2_bkey_durability_safe(). Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 6bf839d69e84..6219f2c08e4c 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -43,6 +43,11 @@ enum bkey_invalid_flags; #define extent_entry_next(_entry) \ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) +#define extent_entry_next_safe(_entry, _end) \ + (likely(__extent_entry_type(_entry) < BCH_EXTENT_ENTRY_MAX) \ + ? extent_entry_next(_entry) \ + : _end) + static inline unsigned __extent_entry_type(const union bch_extent_entry *e) { @@ -280,7 +285,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) #define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ for ((_entry) = (_start); \ (_entry) < (_end); \ - (_entry) = extent_entry_next(_entry)) + (_entry) = extent_entry_next_safe(_entry, _end)) #define __bkey_ptr_next(_ptr, _end) \ ({ \ @@ -318,7 +323,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) (_ptr).has_ec = false; \ \ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ - switch (extent_entry_type(_entry)) { \ + switch (__extent_entry_type(_entry)) { \ case BCH_EXTENT_ENTRY_ptr: \ (_ptr).ptr = _entry->ptr; \ goto out; \ @@ -344,7 +349,7 @@ out: \ for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \ (_entry) = _start; \ __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \ - (_entry) = extent_entry_next(_entry)) + (_entry) = extent_entry_next_safe(_entry, _end)) #define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \ __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ -- cgit v1.2.3 From 2f300f09c7899eb35077aad0a1634cd06a29423a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 8 Mar 2024 16:03:19 -0500 Subject: bcachefs: no_splitbrain_check option This adds an option to disable kicking out devices when splitbrain is detected - it seems there's some issues with splitbrain detection and we're kicking out devices erronously. Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.h | 5 +++++ fs/bcachefs/super.c | 25 +++++++++++++++++-------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 9a4b7faa3765..9a83aca31b4b 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -290,6 +290,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Allow mounting in when data will be missing") \ + x(no_splitbrain_check, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Don't kick drives out when splitbrain detected")\ x(discard, u8, \ OPT_FS|OPT_MOUNT|OPT_DEVICE, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 24fa41bbe7e3..73173f63f486 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1061,7 +1061,8 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) } static int bch2_dev_in_fs(struct bch_sb_handle *fs, - struct bch_sb_handle *sb) + struct bch_sb_handle *sb, + struct bch_opts *opts) { if (fs == sb) return 0; @@ -1102,11 +1103,14 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));; prt_newline(&buf); - prt_printf(&buf, "Not using older sb"); + if (!opts->no_splitbrain_check) + prt_printf(&buf, "Not using older sb"); pr_err("%s", buf.buf); printbuf_exit(&buf); - return -BCH_ERR_device_splitbrain; + + if (!opts->no_splitbrain_check) + return -BCH_ERR_device_splitbrain; } struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx); @@ -1129,12 +1133,17 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, prt_printf(&buf, " to be %llu, but ", seq_from_fs); prt_bdevname(&buf, sb->bdev); prt_printf(&buf, " has %llu\n", seq_from_member); - prt_str(&buf, "Not using "); - prt_bdevname(&buf, sb->bdev); + + if (!opts->no_splitbrain_check) { + prt_str(&buf, "Not using "); + prt_bdevname(&buf, sb->bdev); + } pr_err("%s", buf.buf); printbuf_exit(&buf); - return -BCH_ERR_device_splitbrain; + + if (!opts->no_splitbrain_check) + return -BCH_ERR_device_splitbrain; } return 0; @@ -1835,7 +1844,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) dev_idx = sb.sb->dev_idx; - ret = bch2_dev_in_fs(&c->disk_sb, &sb); + ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts); bch_err_msg(c, ret, "bringing %s online", path); if (ret) goto err; @@ -2023,7 +2032,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, best = sb; darray_for_each_reverse(sbs, sb) { - ret = bch2_dev_in_fs(best, sb); + ret = bch2_dev_in_fs(best, sb, &opts); if (ret == -BCH_ERR_device_has_been_removed || ret == -BCH_ERR_device_splitbrain) { -- cgit v1.2.3 From 52f3a72fa7f4f021398d17e4ffa760d0b2a46386 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 27 Feb 2024 07:38:50 -0500 Subject: bcachefs: fix check_inode_deleted_list() check_inode_deleted_list() returns true if the inode is on the deleted list; check_inode() was checking the return code incorrectly. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 6a760777bafb..ac4e40ad168d 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -799,12 +799,9 @@ static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) { struct btree_iter iter; struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0); - int ret = bkey_err(k); - if (ret) - return ret; - + int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set; bch2_trans_iter_exit(trans, &iter); - return k.k->type == KEY_TYPE_set; + return ret; } static int check_inode(struct btree_trans *trans, @@ -876,7 +873,7 @@ static int check_inode(struct btree_trans *trans, if (ret < 0) return ret; - fsck_err_on(ret, c, unlinked_inode_not_on_deleted_list, + fsck_err_on(!ret, c, unlinked_inode_not_on_deleted_list, "inode %llu:%u unlinked, but not on deleted list", u.bi_inum, k.k->p.snapshot); ret = 0; -- cgit v1.2.3 From ba89083e9f5d9d26f64565ec3ecb823b5bcad055 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 8 Mar 2024 19:57:22 -0500 Subject: bcachefs: Fix journal replay with unreadable btree roots When a btree root is unreadable, we still might be able to get some data back by replaying what's in the journal. Previously though, we got confused when journal replay would attempt to replay a key for a level that didn't exist. This adds bch2_btree_increase_depth(), so that journal replay can handle this. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 ++- fs/bcachefs/btree_update_interior.c | 59 +++++++++++++++++++++++++++++++++---- fs/bcachefs/btree_update_interior.h | 2 ++ fs/bcachefs/recovery.c | 11 +++++++ 4 files changed, 70 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 3ef338df82f5..cab2e3fa900b 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1729,7 +1729,9 @@ bch2_btree_iter_traverse(struct btree_iter *iter) if (ret) return ret; - btree_path_set_should_be_locked(trans->paths + iter->path); + struct btree_path *path = btree_iter_path(trans, iter); + if (btree_path_node(path, path->level)) + btree_path_set_should_be_locked(path); return 0; } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 4530b14ff2c3..7203ea8d5026 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1208,10 +1208,6 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) mutex_unlock(&c->btree_cache.lock); mutex_lock(&c->btree_root_lock); - BUG_ON(btree_node_root(c, b) && - (b->c.level < btree_node_root(c, b)->c.level || - !btree_node_dying(btree_node_root(c, b)))); - bch2_btree_id_root(c, b->c.btree_id)->b = b; mutex_unlock(&c->btree_root_lock); @@ -1747,7 +1743,6 @@ int bch2_btree_split_leaf(struct btree_trans *trans, unsigned flags) { /* btree_split & merge may both cause paths array to be reallocated */ - struct btree *b = path_l(trans->paths + path)->b; struct btree_update *as; unsigned l; @@ -1775,6 +1770,60 @@ int bch2_btree_split_leaf(struct btree_trans *trans, return ret; } +static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans, + btree_path_idx_t path_idx) +{ + struct bch_fs *c = as->c; + struct btree_path *path = trans->paths + path_idx; + struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b; + + BUG_ON(!btree_node_locked(path, b->c.level)); + + n = __btree_root_alloc(as, trans, b->c.level + 1); + + bch2_btree_update_add_new_node(as, n); + six_unlock_write(&n->c.lock); + + path->locks_want++; + BUG_ON(btree_node_locked(path, n->c.level)); + six_lock_increment(&n->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, path, n); + + n->sib_u64s[0] = U16_MAX; + n->sib_u64s[1] = U16_MAX; + + bch2_keylist_add(&as->parent_keys, &b->key); + btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys); + + bch2_btree_set_root(as, trans, path, n); + bch2_btree_update_get_open_buckets(as, n); + bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + bch2_trans_node_add(trans, path, n); + six_unlock_intent(&n->c.lock); + + mutex_lock(&c->btree_cache.lock); + list_add_tail(&b->list, &c->btree_cache.live); + mutex_unlock(&c->btree_cache.lock); + + bch2_trans_verify_locks(trans); +} + +int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b; + struct btree_update *as = + bch2_btree_update_start(trans, trans->paths + path, + b->c.level, true, flags); + if (IS_ERR(as)) + return PTR_ERR(as); + + __btree_increase_depth(as, trans, path); + bch2_btree_update_done(as, trans); + return 0; +} + int __bch2_foreground_maybe_merge(struct btree_trans *trans, btree_path_idx_t path, unsigned level, diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index c593c925d1e3..3439b03719c7 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -119,6 +119,8 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned); +int bch2_btree_increase_depth(struct btree_trans *, btree_path_idx_t, unsigned); + int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t, unsigned, unsigned, enum btree_node_sibling); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 1aa21adc7ee5..39271d2d63d1 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -124,6 +124,17 @@ static int bch2_journal_replay_key(struct btree_trans *trans, if (ret) goto out; + struct btree_path *path = btree_iter_path(trans, &iter); + if (unlikely(!btree_path_node(path, k->level))) { + bch2_trans_iter_exit(trans, &iter); + bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, + BTREE_MAX_DEPTH, 0, iter_flags); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_btree_increase_depth(trans, iter.path, 0) ?: + -BCH_ERR_transaction_restart_nested; + goto out; + } + /* Must be checked with btree locked: */ if (k->overwritten) goto out; -- cgit v1.2.3 From b3eba6a4a7e3e148abfde7a30daa855839fcc043 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Mar 2024 14:54:09 -0400 Subject: bcachefs: Fix degraded mode fsck We don't know where the superblock and journal lives on offline devices; that means if a device is offline fsck can't check those buckets. Previously, fsck would incorrectly clear bucket data types for those buckets on offline devices; now we just use the previous state. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 1102995643b1..829da0f92f0d 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1365,11 +1365,10 @@ static int bch2_alloc_write_key(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); - struct bucket gc, *b; + struct bucket old_gc, gc, *b; struct bkey_i_alloc_v4 *a; struct bch_alloc_v4 old_convert, new; const struct bch_alloc_v4 *old; - enum bch_data_type type; int ret; old = bch2_alloc_to_v4(k, &old_convert); @@ -1377,30 +1376,31 @@ static int bch2_alloc_write_key(struct btree_trans *trans, percpu_down_read(&c->mark_lock); b = gc_bucket(ca, iter->pos.offset); + old_gc = *b; + + if ((old->data_type == BCH_DATA_sb || + old->data_type == BCH_DATA_journal) && + !bch2_dev_is_online(ca)) { + b->data_type = old->data_type; + b->dirty_sectors = old->dirty_sectors; + } /* * b->data_type doesn't yet include need_discard & need_gc_gen states - * fix that here: */ - type = __alloc_data_type(b->dirty_sectors, - b->cached_sectors, - b->stripe, - *old, - b->data_type); - if (b->data_type != type) { - struct bch_dev_usage *u; - - preempt_disable(); - u = this_cpu_ptr(ca->usage_gc); - u->d[b->data_type].buckets--; - b->data_type = type; - u->d[b->data_type].buckets++; - preempt_enable(); - } - + b->data_type = __alloc_data_type(b->dirty_sectors, + b->cached_sectors, + b->stripe, + *old, + b->data_type); gc = *b; percpu_up_read(&c->mark_lock); + if (gc.data_type != old_gc.data_type || + gc.dirty_sectors != old_gc.dirty_sectors) + bch2_dev_usage_update_m(c, ca, &old_gc, &gc); + if (metadata_only && gc.data_type != BCH_DATA_sb && gc.data_type != BCH_DATA_journal && -- cgit v1.2.3 From 94817db95681155437100d9f25b3e1390fff8ad6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 8 Mar 2024 14:53:03 -0500 Subject: bcachefs: Correctly validate k->u64s in btree node read path validate_bset_keys() never properly validated k->u64s; it checked if it was 0, but not if it was smaller than keys for the given packed format; this fixes that small oversight. This patch was backported, so it's adding quite a few error enums so that they don't get renumbered and we don't have confusing gaps. Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.h | 5 +---- fs/bcachefs/btree_io.c | 11 ++++++++++- fs/bcachefs/sb-errors_types.h | 17 ++++++++++++++++- 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 831be01809f2..2a8b8fef539c 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -362,10 +362,7 @@ static inline struct bpos bkey_start_pos(const struct bkey *k) static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, const struct bkey_packed *k) { - unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s; - - EBUG_ON(k->u64s < ret); - return ret; + return bkey_packed(k) ? format->key_u64s : BKEY_U64s; } static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index aa9b6cbe3226..caf3eecfc801 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -840,6 +840,9 @@ static bool __bkey_valid(struct bch_fs *c, struct btree *b, if (k->format > KEY_FORMAT_CURRENT) return false; + if (k->u64s < bkeyp_key_u64s(&b->format, k)) + return false; + struct printbuf buf = PRINTBUF; struct bkey tmp; struct bkey_s u = __bkey_disassemble(b, k, &tmp); @@ -881,7 +884,13 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, "invalid bkey format %u", k->format)) goto drop_this_key; - /* XXX: validate k->u64s */ + if (btree_err_on(k->u64s < bkeyp_key_u64s(&b->format, k), + -BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, i, + btree_node_bkey_bad_u64s, + "k->u64s too small (%u < %u)", k->u64s, bkeyp_key_u64s(&b->format, k))) + goto drop_this_key; + if (!write) bch2_bkey_compat(b->c.level, b->c.btree_id, version, BSET_BIG_ENDIAN(i), write, diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index c08aacdfd073..2c998e721d90 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -250,7 +250,22 @@ x(hash_table_key_duplicate, 242) \ x(hash_table_key_wrong_offset, 243) \ x(unlinked_inode_not_on_deleted_list, 244) \ - x(reflink_p_front_pad_bad, 245) + x(reflink_p_front_pad_bad, 245) \ + x(journal_entry_dup_same_device, 246) \ + x(inode_bi_subvol_missing, 247) \ + x(inode_bi_subvol_wrong, 248) \ + x(inode_points_to_missing_dirent, 249) \ + x(inode_points_to_wrong_dirent, 250) \ + x(inode_bi_parent_nonzero, 251) \ + x(dirent_to_missing_parent_subvol, 252) \ + x(dirent_not_visible_in_parent_subvol, 253) \ + x(subvol_fs_path_parent_wrong, 254) \ + x(subvol_root_fs_path_parent_nonzero, 255) \ + x(subvol_children_not_set, 256) \ + x(subvol_children_bad, 257) \ + x(subvol_loop, 258) \ + x(subvol_unreachable, 259) \ + x(btree_node_bkey_bad_u64s, 260) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, -- cgit v1.2.3 From fadc6067f2dded5affe0ef281847fa968f1e1354 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jan 2024 16:32:12 -0500 Subject: bcachefs: Set path->uptodate when no node at level We were failing to set path->uptodate when reaching the end of a btree node iterator, causing the new prefetch code for backpointers gc to go into an infinite loop. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index cab2e3fa900b..e6e7649dec79 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1146,7 +1146,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, path = &trans->paths[path_idx]; if (unlikely(path->level >= BTREE_MAX_DEPTH)) - goto out; + goto out_uptodate; path->level = btree_path_up_until_good_node(trans, path, 0); @@ -1179,7 +1179,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, goto out; } } - +out_uptodate: path->uptodate = BTREE_ITER_UPTODATE; out: if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted) -- cgit v1.2.3 From 067f244c9e4d2c00b493291df5f17510fd0bd9c1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 27 Jan 2024 00:31:13 -0500 Subject: bcachefs: fix split brain message Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 73173f63f486..2ca89b315ceb 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1128,7 +1128,7 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, prt_newline(&buf); prt_bdevname(&buf, fs->bdev); - prt_str(&buf, "believes seq of "); + prt_str(&buf, " believes seq of "); prt_bdevname(&buf, sb->bdev); prt_printf(&buf, " to be %llu, but ", seq_from_fs); prt_bdevname(&buf, sb->bdev); -- cgit v1.2.3 From 0be5b38bce6c4d3563f7575f6bac94806b2fbc17 Mon Sep 17 00:00:00 2001 From: Guoyu Ou Date: Tue, 13 Feb 2024 16:20:04 +0800 Subject: bcachefs: skip invisible entries in empty subvolume checking When we are checking whether a subvolume is empty in the specified snapshot, entries that do not belong to this subvolume should be skipped. This fixes the following case: $ bcachefs subvolume create ./sub $ cd sub $ bcachefs subvolume create ./sub2 $ bcachefs subvolume snapshot . ./snap $ ls -a snap . .. $ rmdir snap rmdir: failed to remove 'snap': Directory not empty As Kent suggested, we pass 0 in may_delete_deleted_inode() to ignore subvols in the subvol we are checking, because inode.bi_subvol is only set on subvolume roots, and we can't go through every inode in the subvolume and change bi_subvol when taking a snapshot. It makes the check less strict, but that's ok, the rest of fsck will still catch it. Signed-off-by: Guoyu Ou Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 7 +++++-- fs/bcachefs/dirent.h | 2 +- fs/bcachefs/inode.c | 5 +++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 4ae1e9f002a0..82c5bff01411 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -508,7 +508,7 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, return ret; } -int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot) +int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 snapshot) { struct btree_iter iter; struct bkey_s_c k; @@ -518,6 +518,9 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot) SPOS(dir, 0, snapshot), POS(dir, U64_MAX), 0, k, ret) if (k.k->type == KEY_TYPE_dirent) { + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol) + continue; ret = -ENOTEMPTY; break; } @@ -531,7 +534,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) u32 snapshot; return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?: - bch2_empty_dir_snapshot(trans, dir.inum, snapshot); + bch2_empty_dir_snapshot(trans, dir.inum, dir.subvol, snapshot); } int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 21ffeb78f02e..aeb8207ca9f2 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -69,7 +69,7 @@ u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, const struct bch_hash_info *, const struct qstr *, subvol_inum *); -int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32); +int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32); int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 086f0090b03a..b07ab98e460e 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1088,8 +1088,9 @@ static int may_delete_deleted_inode(struct btree_trans *trans, goto out; if (S_ISDIR(inode.bi_mode)) { - ret = bch2_empty_dir_snapshot(trans, pos.offset, pos.snapshot); - if (fsck_err_on(ret == -ENOTEMPTY, c, deleted_inode_is_dir, + ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot); + if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY), + c, deleted_inode_is_dir, "non empty directory %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; -- cgit v1.2.3 From 4f70176cb9df0ef452615a64084c3ad70e11c275 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Jan 2024 11:06:59 -0500 Subject: bcachefs: Kill unnecessary wakeups in journal reclaim Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index c33dca641575..b975b5727471 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -394,8 +394,6 @@ void bch2_journal_pin_copy(struct journal *j, struct journal_entry_pin *src, journal_pin_flush_fn flush_fn) { - bool reclaim; - spin_lock(&j->lock); u64 seq = READ_ONCE(src->seq); @@ -411,44 +409,44 @@ void bch2_journal_pin_copy(struct journal *j, return; } - reclaim = __journal_pin_drop(j, dst); + bool reclaim = __journal_pin_drop(j, dst); bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); - spin_unlock(&j->lock); /* * If the journal is currently full, we might want to call flush_fn * immediately: */ - journal_wake(j); + if (seq == journal_last_seq(j)) + journal_wake(j); + spin_unlock(&j->lock); } void bch2_journal_pin_set(struct journal *j, u64 seq, struct journal_entry_pin *pin, journal_pin_flush_fn flush_fn) { - bool reclaim; - spin_lock(&j->lock); BUG_ON(seq < journal_last_seq(j)); - reclaim = __journal_pin_drop(j, pin); + bool reclaim = __journal_pin_drop(j, pin); bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); - spin_unlock(&j->lock); - /* * If the journal is currently full, we might want to call flush_fn * immediately: */ - journal_wake(j); + if (seq == journal_last_seq(j)) + journal_wake(j); + + spin_unlock(&j->lock); } /** -- cgit v1.2.3 From 656f05d8bd65bd9cc4a07364e9497af55b09e436 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Jan 2024 11:21:46 -0500 Subject: bcachefs: Split out journal workqueue We don't want journal write completions to be blocked behind btree transactions - io_complete_wq is used for btree updates after data and metadata writes. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 22 ++++++++++++---------- fs/bcachefs/journal_io.c | 12 ++++++------ fs/bcachefs/journal_types.h | 1 + 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index bc890776eb57..7c6f3ae47507 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -181,14 +181,12 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) */ void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); - lockdep_assert_held(&j->lock); if (__bch2_journal_pin_put(j, seq)) bch2_journal_reclaim_fast(j); if (write) - closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); + closure_call(&j->io, bch2_journal_write, j->wq, NULL); } /* @@ -418,7 +416,7 @@ static int journal_entry_open(struct journal *j) } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - mod_delayed_work(c->io_complete_wq, + mod_delayed_work(j->wq, &j->write_work, msecs_to_jiffies(c->opts.journal_flush_delay)); journal_wake(j); @@ -445,7 +443,6 @@ static void journal_quiesce(struct journal *j) static void journal_write_work(struct work_struct *work) { struct journal *j = container_of(work, struct journal, write_work.work); - struct bch_fs *c = container_of(j, struct bch_fs, journal); long delta; spin_lock(&j->lock); @@ -455,7 +452,7 @@ static void journal_write_work(struct work_struct *work) delta = journal_cur_buf(j)->expires - jiffies; if (delta > 0) - mod_delayed_work(c->io_complete_wq, &j->write_work, delta); + mod_delayed_work(j->wq, &j->write_work, delta); else __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); unlock: @@ -1303,11 +1300,12 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) void bch2_fs_journal_exit(struct journal *j) { - unsigned i; + if (j->wq) + destroy_workqueue(j->wq); darray_exit(&j->early_journal_entries); - for (i = 0; i < ARRAY_SIZE(j->buf); i++) + for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) kvpfree(j->buf[i].data, j->buf[i].buf_size); free_fifo(&j->pin); } @@ -1315,7 +1313,6 @@ void bch2_fs_journal_exit(struct journal *j) int bch2_fs_journal_init(struct journal *j) { static struct lock_class_key res_key; - unsigned i; mutex_init(&j->buf_lock); spin_lock_init(&j->lock); @@ -1336,7 +1333,7 @@ int bch2_fs_journal_init(struct journal *j) if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) return -BCH_ERR_ENOMEM_journal_pin_fifo; - for (i = 0; i < ARRAY_SIZE(j->buf); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) { j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL); if (!j->buf[i].data) @@ -1344,6 +1341,11 @@ int bch2_fs_journal_init(struct journal *j) } j->pin.front = j->pin.back = 1; + + j->wq = alloc_workqueue("bcachefs_journal", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512); + if (!j->wq) + return -BCH_ERR_ENOMEM_fs_other_alloc; return 0; } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 47805193f18c..5dcb4f4ceae7 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1648,7 +1648,7 @@ static CLOSURE_CALLBACK(journal_write_done) if (!journal_state_count(new, new.unwritten_idx) && journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { spin_unlock(&j->lock); - closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); + closure_call(&j->io, bch2_journal_write, j->wq, NULL); } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { struct journal_buf *buf = journal_cur_buf(j); @@ -1661,7 +1661,7 @@ static CLOSURE_CALLBACK(journal_write_done) */ spin_unlock(&j->lock); - mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); + mod_delayed_work(j->wq, &j->write_work, max(0L, delta)); } else { spin_unlock(&j->lock); } @@ -1731,7 +1731,7 @@ static CLOSURE_CALLBACK(do_journal_write) le64_to_cpu(w->data->seq); } - continue_at(cl, journal_write_done, c->io_complete_wq); + continue_at(cl, journal_write_done, j->wq); } static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) @@ -1998,12 +1998,12 @@ CLOSURE_CALLBACK(bch2_journal_write) } } - continue_at(cl, do_journal_write, c->io_complete_wq); + continue_at(cl, do_journal_write, j->wq); return; no_io: - continue_at(cl, journal_write_done, c->io_complete_wq); + continue_at(cl, journal_write_done, j->wq); return; err: bch2_fatal_error(c); - continue_at(cl, journal_write_done, c->io_complete_wq); + continue_at(cl, journal_write_done, j->wq); } diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 38817c7a0851..2ca5d5014cf6 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -205,6 +205,7 @@ struct journal { struct closure io; struct delayed_work write_work; + struct workqueue_struct *wq; /* Sequence number of most recent journal entry (last entry in @pin) */ atomic64_t seq; -- cgit v1.2.3 From a4e92339115d10f07fac051454616dbe6dbd47f5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Jan 2024 11:24:37 -0500 Subject: bcachefs: Avoid setting j->write_work unnecessarily Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 7c6f3ae47507..04309e7c108b 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -416,9 +416,10 @@ static int journal_entry_open(struct journal *j) } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - mod_delayed_work(j->wq, - &j->write_work, - msecs_to_jiffies(c->opts.journal_flush_delay)); + if (nr_unwritten_journal_entries(j) == 1) + mod_delayed_work(j->wq, + &j->write_work, + msecs_to_jiffies(c->opts.journal_flush_delay)); journal_wake(j); if (j->early_journal_entries.nr) @@ -443,19 +444,16 @@ static void journal_quiesce(struct journal *j) static void journal_write_work(struct work_struct *work) { struct journal *j = container_of(work, struct journal, write_work.work); - long delta; spin_lock(&j->lock); - if (!__journal_entry_is_open(j->reservations)) - goto unlock; - - delta = journal_cur_buf(j)->expires - jiffies; + if (__journal_entry_is_open(j->reservations)) { + long delta = journal_cur_buf(j)->expires - jiffies; - if (delta > 0) - mod_delayed_work(j->wq, &j->write_work, delta); - else - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); -unlock: + if (delta > 0) + mod_delayed_work(j->wq, &j->write_work, delta); + else + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); + } spin_unlock(&j->lock); } -- cgit v1.2.3 From bdec47f57f26248a6fba67803c2ee44484534735 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Jan 2024 11:25:46 -0500 Subject: bcachefs: Journal writes should be REQ_SYNC|REQ_META Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 5dcb4f4ceae7..8047425e84c3 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1991,7 +1991,7 @@ CLOSURE_CALLBACK(bch2_journal_write) bio = ca->journal.bio; bio_reset(bio, ca->disk_sb.bdev, - REQ_OP_WRITE|REQ_PREFLUSH); + REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH); bio->bi_end_io = journal_write_endio; bio->bi_private = ca; closure_bio_submit(bio, cl); -- cgit v1.2.3 From e6fab655e6f59d1b50ee9e97c727a6f6f84b0f1d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Jan 2024 11:28:13 -0500 Subject: bcachefs: Avoid taking journal lock unnecessarily Previously, any time we failed to get a journal reservation we'd retry, with the journal lock held; but this isn't necessary given wait_event()/wake_up() ordering. This avoids performance cliffs when the journal starts to get backed up and lock contention shoots up. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 107 ++++++++++++++++++++++---------------------- fs/bcachefs/journal_types.h | 1 + 2 files changed, 55 insertions(+), 53 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 04309e7c108b..20fdacb86f51 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -27,6 +27,26 @@ static const char * const bch2_journal_errors[] = { NULL }; +static inline bool journal_seq_unwritten(struct journal *j, u64 seq) +{ + return seq > j->seq_ondisk; +} + +static bool __journal_entry_is_open(union journal_res_state state) +{ + return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; +} + +static inline unsigned nr_unwritten_journal_entries(struct journal *j) +{ + return atomic64_read(&j->seq) - j->seq_ondisk; +} + +static bool journal_entry_is_open(struct journal *j) +{ + return __journal_entry_is_open(j->reservations); +} + static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq) { union journal_res_state s = READ_ONCE(j->reservations); @@ -66,26 +86,7 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) seq <= journal_cur_seq(j); seq++) bch2_journal_buf_to_text(out, j, seq); -} - -static inline bool journal_seq_unwritten(struct journal *j, u64 seq) -{ - return seq > j->seq_ondisk; -} - -static bool __journal_entry_is_open(union journal_res_state state) -{ - return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; -} - -static inline unsigned nr_unwritten_journal_entries(struct journal *j) -{ - return atomic64_read(&j->seq) - j->seq_ondisk; -} - -static bool journal_entry_is_open(struct journal *j) -{ - return __journal_entry_is_open(j->reservations); + prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed"); } static inline struct journal_buf * @@ -468,33 +469,32 @@ retry: if (journal_res_get_fast(j, res, flags)) return 0; - if (bch2_journal_error(j)) - return -BCH_ERR_erofs_journal_err; + if ((flags & BCH_WATERMARK_MASK) < j->watermark) { + ret = JOURNAL_ERR_journal_full; + can_discard = j->can_discard; + goto out; + } - spin_lock(&j->lock); + if (j->blocked) + return -BCH_ERR_journal_res_get_blocked; - /* check once more in case somebody else shut things down... */ - if (bch2_journal_error(j)) { - spin_unlock(&j->lock); + if (bch2_journal_error(j)) return -BCH_ERR_erofs_journal_err; + + if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) { + ret = JOURNAL_ERR_max_in_flight; + goto out; } + spin_lock(&j->lock); + /* * Recheck after taking the lock, so we don't race with another thread * that just did journal_entry_open() and call bch2_journal_entry_close() * unnecessarily */ if (journal_res_get_fast(j, res, flags)) { - spin_unlock(&j->lock); - return 0; - } - - if ((flags & BCH_WATERMARK_MASK) < j->watermark) { - /* - * Don't want to close current journal entry, just need to - * invoke reclaim: - */ - ret = JOURNAL_ERR_journal_full; + ret = 0; goto unlock; } @@ -510,30 +510,31 @@ retry: j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false); - ret = journal_entry_open(j); - - if (ret == JOURNAL_ERR_max_in_flight) { - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], - &j->max_in_flight_start, true); - if (trace_journal_entry_full_enabled()) { - struct printbuf buf = PRINTBUF; - buf.atomic++; - - bch2_journal_bufs_to_text(&buf, j); - trace_journal_entry_full(c, buf.buf); - printbuf_exit(&buf); - } - count_event(c, journal_entry_full); - } + ret = journal_entry_open(j) ?: JOURNAL_ERR_retry; unlock: can_discard = j->can_discard; spin_unlock(&j->lock); - - if (!ret) +out: + if (ret == JOURNAL_ERR_retry) goto retry; + if (!ret) + return 0; + if (journal_error_check_stuck(j, ret, flags)) ret = -BCH_ERR_journal_res_get_blocked; + if (ret == JOURNAL_ERR_max_in_flight && + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], + &j->max_in_flight_start, true)) { + + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); + bch2_journal_bufs_to_text(&buf, j); + trace_journal_entry_full(c, buf.buf); + printbuf_exit(&buf); + count_event(c, journal_entry_full); + } + /* * Journal is full - can't rely on reclaim from work item due to * freezing: diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 2ca5d5014cf6..1493c262eaf4 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -134,6 +134,7 @@ enum journal_flags { /* Reasons we may fail to get a journal reservation: */ #define JOURNAL_ERRORS() \ x(ok) \ + x(retry) \ x(blocked) \ x(max_in_flight) \ x(journal_full) \ -- cgit v1.2.3 From 7b05ecbafc174c4bac9546c41eae0cf5efda827a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 16 Jan 2024 17:29:15 -0500 Subject: bcachefs: fixup for building in userspace Signed-off-by: Kent Overstreet --- fs/bcachefs/checksum.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 3c761ad6b1c8..4701457f6381 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -558,7 +558,7 @@ got_key: return 0; } -#include "../crypto.h" +#include "crypto.h" #endif int bch2_request_key(struct bch_sb *sb, struct bch_key *key) -- cgit v1.2.3 From 3f305e04984634d62ce5c46a2c97ef000447e6d9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Jan 2024 17:46:14 -0500 Subject: bcachefs: Improve bch2_dirent_to_text() For DT_SUBVOL, we now print both parent and child subvol IDs. Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 82c5bff01411..86b36874bb7a 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -144,19 +144,21 @@ fsck_err: return ret; } -void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) +void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); struct qstr d_name = bch2_dirent_get_name(d); - prt_printf(out, "%.*s -> %llu type %s", - d_name.len, - d_name.name, - d.v->d_type != DT_SUBVOL - ? le64_to_cpu(d.v->d_inum) - : le32_to_cpu(d.v->d_child_subvol), - bch2_d_type_str(d.v->d_type)); + prt_printf(out, "%.*s -> ", d_name.len, d_name.name); + + if (d.v->d_type != DT_SUBVOL) + prt_printf(out, "%llu", le64_to_cpu(d.v->d_inum)); + else + prt_printf(out, "%u -> %u", + le32_to_cpu(d.v->d_parent_subvol), + le32_to_cpu(d.v->d_child_subvol)); + + prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type)); } static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, -- cgit v1.2.3 From 6b83aee8a41b12b1985d4a77b5ae11f9e3491744 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 22 Jan 2024 20:55:08 -0500 Subject: bcachefs: Workqueues should be WQ_HIGHPRI Most bcachefs workqueues are used for completions, and should be WQ_HIGHPRI - this helps reduce queuing delays, we want to complete quickly once we can no longer signal backpressure by blocking. Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 2ca89b315ceb..b91efec11dfe 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -862,13 +862,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); if (!(c->btree_update_wq = alloc_workqueue("bcachefs", - WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", - WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", - WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || !(c->io_complete_wq = alloc_workqueue("bcachefs_io", - WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 512)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) || !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", WQ_FREEZABLE, 0)) || #ifndef BCH_WRITE_REF_DEBUG -- cgit v1.2.3 From 23f25223157c60b00f86d74d141772c5c1996ae4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 25 Jan 2024 19:57:26 -0500 Subject: bcachefs: bch2_hash_set_snapshot() -> bch2_hash_set_in_snapshot() Minor renaming for clarity, bit of refactoring. Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 8 ++++---- fs/bcachefs/fsck.c | 7 +++---- fs/bcachefs/str_hash.h | 15 +++++---------- 3 files changed, 12 insertions(+), 18 deletions(-) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 86b36874bb7a..3ea98e13e34f 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -219,10 +219,10 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, dirent->k.p.inode = dir; dirent->k.p.snapshot = snapshot; - ret = bch2_hash_set_snapshot(trans, bch2_dirent_hash_desc, hash_info, - zero_inum, snapshot, - &dirent->k_i, str_hash_flags, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, + zero_inum, snapshot, + &dirent->k_i, str_hash_flags, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); *dir_offset = dirent->k.p.offset; return ret; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index ac4e40ad168d..e92efc944823 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -100,8 +100,8 @@ err: } static int lookup_inode(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode, - u32 *snapshot) + struct bch_inode_unpacked *inode, + u32 *snapshot) { struct btree_iter iter; struct bkey_s_c k; @@ -722,7 +722,7 @@ static int hash_redo_key(struct btree_trans *trans, delete->k.p = k_iter->pos; return bch2_btree_iter_traverse(k_iter) ?: bch2_trans_update(trans, k_iter, delete, 0) ?: - bch2_hash_set_snapshot(trans, desc, hash_info, + bch2_hash_set_in_snapshot(trans, desc, hash_info, (subvol_inum) { 0, k.k->p.inode }, k.k->p.snapshot, tmp, BCH_HASH_SET_MUST_CREATE, @@ -1778,7 +1778,6 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (d.v->d_type == DT_DIR) for_each_visible_inode(c, s, dir, equiv.snapshot, i) i->count++; - out: err: fsck_err: diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index fcaa5a888744..3976f80721bf 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -259,7 +259,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, } static __always_inline -int bch2_hash_set_snapshot(struct btree_trans *trans, +int bch2_hash_set_in_snapshot(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, subvol_inum inum, u32 snapshot, @@ -328,17 +328,12 @@ int bch2_hash_set(struct btree_trans *trans, struct bkey_i *insert, bch_str_hash_flags_t str_hash_flags) { - u32 snapshot; - int ret; - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - return ret; - insert->k.p.inode = inum.inum; - return bch2_hash_set_snapshot(trans, desc, info, inum, - snapshot, insert, str_hash_flags, 0); + u32 snapshot; + return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: + bch2_hash_set_in_snapshot(trans, desc, info, inum, + snapshot, insert, str_hash_flags, 0); } static __always_inline -- cgit v1.2.3 From 5b6271b5091263bb705f9634917b41ce980d2ba7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 25 Jan 2024 12:35:06 -0500 Subject: bcachefs: Cleanup bch2_dirent_lookup_trans() Drop an unnecessary bch2_subvolume_get_snapshot() call, and drop the __ from the name - this is a normal interface. Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 34 +++++++++++----------------------- fs/bcachefs/dirent.h | 2 +- fs/bcachefs/fs-common.c | 4 ++-- 3 files changed, 14 insertions(+), 26 deletions(-) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 3ea98e13e34f..8a4572594545 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -458,41 +458,29 @@ out: return ret; } -int __bch2_dirent_lookup_trans(struct btree_trans *trans, - struct btree_iter *iter, - subvol_inum dir, - const struct bch_hash_info *hash_info, - const struct qstr *name, subvol_inum *inum, - unsigned flags) +int bch2_dirent_lookup_trans(struct btree_trans *trans, + struct btree_iter *iter, + subvol_inum dir, + const struct bch_hash_info *hash_info, + const struct qstr *name, subvol_inum *inum, + unsigned flags) { - struct bkey_s_c k; - struct bkey_s_c_dirent d; - u32 snapshot; - int ret; - - ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); + int ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, + hash_info, dir, name, flags); if (ret) return ret; - ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, - hash_info, dir, name, flags); - if (ret) - return ret; - - k = bch2_btree_iter_peek_slot(iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) goto err; - d = bkey_s_c_to_dirent(k); - - ret = bch2_dirent_read_target(trans, dir, d, inum); + ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum); if (ret > 0) ret = -ENOENT; err: if (ret) bch2_trans_iter_exit(trans, iter); - return ret; } @@ -504,7 +492,7 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, struct btree_iter iter = { NULL }; int ret = lockrestart_do(trans, - __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0)); + bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0)); bch2_trans_iter_exit(trans, &iter); bch2_trans_put(trans); return ret; diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index aeb8207ca9f2..1cd033fbc58a 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -62,7 +62,7 @@ int bch2_dirent_rename(struct btree_trans *, const struct qstr *, subvol_inum *, u64 *, enum bch_rename_mode); -int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, +int bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, subvol_inum, const struct bch_hash_info *, const struct qstr *, subvol_inum *, unsigned); u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 1c1ea0f0c692..8ee716e4c2e7 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -260,8 +260,8 @@ int bch2_unlink_trans(struct btree_trans *trans, dir_hash = bch2_hash_info_init(c, dir_u); - ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, - name, &inum, BTREE_ITER_INTENT); + ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, + name, &inum, BTREE_ITER_INTENT); if (ret) goto err; -- cgit v1.2.3 From a555bcf4fa8dea612564d1133dad881c6681bae3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 27 Jan 2024 00:05:03 -0500 Subject: bcachefs: convert journal replay ptrs to darray Eliminates some error paths - no longer have a hardcoded BCH_REPLICAS_MAX limit. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 5 ++-- fs/bcachefs/journal_io.c | 70 ++++++++++++++++-------------------------------- fs/bcachefs/journal_io.h | 19 +++++++------ 3 files changed, 36 insertions(+), 58 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 20fdacb86f51..edbbedaa0def 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1153,7 +1153,6 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) struct journal_replay *i, **_i; struct genradix_iter iter; bool had_entries = false; - unsigned ptr; u64 last_seq = cur_seq, nr, seq; genradix_for_each_reverse(&c->journal_entries, iter, _i) { @@ -1207,8 +1206,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) p = journal_seq_pin(j, seq); p->devs.nr = 0; - for (ptr = 0; ptr < i->nr_ptrs; ptr++) - bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev); + darray_for_each(i->ptrs, ptr) + bch2_dev_list_add_dev(&p->devs, ptr->dev); had_entries = true; } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 8047425e84c3..8cfb8f1fe02d 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -84,7 +84,6 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, { struct genradix_iter iter; struct journal_replay **_i, *i, *dup; - struct journal_ptr *ptr; size_t bytes = vstruct_bytes(j); u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; int ret = JOURNAL_ENTRY_ADD_OK; @@ -156,45 +155,29 @@ replace: if (!i) return -BCH_ERR_ENOMEM_journal_entry_add; - i->nr_ptrs = 0; + darray_init(&i->ptrs); i->csum_good = entry_ptr.csum_good; i->ignore = false; unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); - i->ptrs[i->nr_ptrs++] = entry_ptr; + darray_push(&i->ptrs, entry_ptr); if (dup) { - if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) { - bch_err(c, "found too many copies of journal entry %llu", - le64_to_cpu(i->j.seq)); - dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1; - } - /* The first ptr should represent the jset we kept: */ - memcpy(i->ptrs + i->nr_ptrs, - dup->ptrs, - sizeof(dup->ptrs[0]) * dup->nr_ptrs); - i->nr_ptrs += dup->nr_ptrs; + darray_for_each(dup->ptrs, ptr) + darray_push(&i->ptrs, *ptr); __journal_replay_free(c, dup); } *_i = i; - return 0; found: - for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { + darray_for_each(i->ptrs, ptr) if (ptr->dev == ca->dev_idx) { bch_err(c, "duplicate journal entry %llu on same device", le64_to_cpu(i->j.seq)); goto out; } - } - if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) { - bch_err(c, "found too many copies of journal entry %llu", - le64_to_cpu(i->j.seq)); - goto out; - } - - i->ptrs[i->nr_ptrs++] = entry_ptr; + ret = darray_push(&i->ptrs, entry_ptr); out: fsck_err: return ret; @@ -1102,16 +1085,15 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) if (!r) continue; - for (i = 0; i < r->nr_ptrs; i++) { - if (r->ptrs[i].dev == ca->dev_idx) { - unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) + + darray_for_each(r->ptrs, i) + if (i->dev == ca->dev_idx) { + unsigned wrote = bucket_remainder(ca, i->sector) + vstruct_sectors(&r->j, c->block_bits); - ja->cur_idx = r->ptrs[i].bucket; + ja->cur_idx = i->bucket; ja->sectors_free = ca->mi.bucket_size - wrote; goto found; } - } } found: mutex_unlock(&jlist->lock); @@ -1158,21 +1140,16 @@ err: void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct journal_replay *j) { - unsigned i; - - for (i = 0; i < j->nr_ptrs; i++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev); + darray_for_each(j->ptrs, i) { + struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev); u64 offset; - div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset); + div64_u64_rem(i->sector, ca->mi.bucket_size, &offset); - if (i) + if (i != j->ptrs.data) prt_printf(out, " "); prt_printf(out, "%u:%u:%u (sector %llu)", - j->ptrs[i].dev, - j->ptrs[i].bucket, - j->ptrs[i].bucket_offset, - j->ptrs[i].sector); + i->dev, i->bucket, i->bucket_offset, i->sector); } } @@ -1353,32 +1330,31 @@ int bch2_journal_read(struct bch_fs *c, .e.data_type = BCH_DATA_journal, .e.nr_required = 1, }; - unsigned ptr; i = *_i; if (!i || i->ignore) continue; - for (ptr = 0; ptr < i->nr_ptrs; ptr++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); + darray_for_each(i->ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - if (!i->ptrs[ptr].csum_good) - bch_err_dev_offset(ca, i->ptrs[ptr].sector, + if (!ptr->csum_good) + bch_err_dev_offset(ca, ptr->sector, "invalid journal checksum, seq %llu%s", le64_to_cpu(i->j.seq), i->csum_good ? " (had good copy on another device)" : ""); } ret = jset_validate(c, - bch_dev_bkey_exists(c, i->ptrs[0].dev), + bch_dev_bkey_exists(c, i->ptrs.data[0].dev), &i->j, - i->ptrs[0].sector, + i->ptrs.data[0].sector, READ); if (ret) goto err; - for (ptr = 0; ptr < i->nr_ptrs; ptr++) - replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; + darray_for_each(i->ptrs, ptr) + replicas.e.devs[replicas.e.nr_devs++] = ptr->dev; bch2_replicas_entry_sort(&replicas.e); diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index c035e7c108e1..1e0b9a571648 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -2,19 +2,22 @@ #ifndef _BCACHEFS_JOURNAL_IO_H #define _BCACHEFS_JOURNAL_IO_H +#include "darray.h" + +struct journal_ptr { + bool csum_good; + u8 dev; + u32 bucket; + u32 bucket_offset; + u64 sector; +}; + /* * Only used for holding the journal entries we read in btree_journal_read() * during cache_registration */ struct journal_replay { - struct journal_ptr { - bool csum_good; - u8 dev; - u32 bucket; - u32 bucket_offset; - u64 sector; - } ptrs[BCH_REPLICAS_MAX]; - unsigned nr_ptrs; + DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs; bool csum_good; bool ignore; -- cgit v1.2.3 From 3d3d23b341109fe349b1f448c2d89d2ee4429415 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 27 Jan 2024 10:01:23 -0500 Subject: bcachefs: improve journal entry read fsck error messages Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 96 +++++++++++++++++++++++++++--------------------- 1 file changed, 55 insertions(+), 41 deletions(-) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 8cfb8f1fe02d..66ce522950ff 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -17,6 +17,30 @@ #include "sb-clean.h" #include "trace.h" +void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct journal_replay *j) +{ + darray_for_each(j->ptrs, i) { + struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev); + u64 offset; + + div64_u64_rem(i->sector, ca->mi.bucket_size, &offset); + + if (i != j->ptrs.data) + prt_printf(out, " "); + prt_printf(out, "%u:%u:%u (sector %llu)", + i->dev, i->bucket, i->bucket_offset, i->sector); + } +} + +static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, + struct journal_replay *j) +{ + prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); + + bch2_journal_ptrs_to_text(out, c, j); +} + static struct nonce journal_nonce(const struct jset *jset) { return (struct nonce) {{ @@ -86,6 +110,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, struct journal_replay **_i, *i, *dup; size_t bytes = vstruct_bytes(j); u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; + struct printbuf buf = PRINTBUF; int ret = JOURNAL_ENTRY_ADD_OK; /* Is this entry older than the range we need? */ @@ -130,25 +155,37 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, */ dup = *_i; if (dup) { - if (bytes == vstruct_bytes(&dup->j) && - !memcmp(j, &dup->j, bytes)) { - i = dup; - goto found; - } + bool identical = bytes == vstruct_bytes(&dup->j) && + !memcmp(j, &dup->j, bytes); + bool not_identical = !identical && + entry_ptr.csum_good && + dup->csum_good; - if (!entry_ptr.csum_good) { - i = dup; - goto found; - } + bool same_device = false; + darray_for_each(dup->ptrs, ptr) + if (ptr->dev == ca->dev_idx) + same_device = true; + + ret = darray_push(&dup->ptrs, entry_ptr); + if (ret) + goto out; - if (!dup->csum_good) + bch2_journal_replay_to_text(&buf, c, dup); + + fsck_err_on(same_device, + c, journal_entry_dup_same_device, + "duplicate journal entry on same device\n %s", + buf.buf); + + fsck_err_on(not_identical, + c, journal_entry_replicas_data_mismatch, + "found duplicate but non identical journal entries\n %s", + buf.buf); + + if (entry_ptr.csum_good && !identical) goto replace; - fsck_err(c, journal_entry_replicas_data_mismatch, - "found duplicate but non identical journal entries (seq %llu)", - le64_to_cpu(j->seq)); - i = dup; - goto found; + goto out; } replace: i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); @@ -159,27 +196,20 @@ replace: i->csum_good = entry_ptr.csum_good; i->ignore = false; unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); - darray_push(&i->ptrs, entry_ptr); if (dup) { /* The first ptr should represent the jset we kept: */ darray_for_each(dup->ptrs, ptr) darray_push(&i->ptrs, *ptr); __journal_replay_free(c, dup); + } else { + darray_push(&i->ptrs, entry_ptr); } *_i = i; -found: - darray_for_each(i->ptrs, ptr) - if (ptr->dev == ca->dev_idx) { - bch_err(c, "duplicate journal entry %llu on same device", - le64_to_cpu(i->j.seq)); - goto out; - } - - ret = darray_push(&i->ptrs, entry_ptr); out: fsck_err: + printbuf_exit(&buf); return ret; } @@ -1137,22 +1167,6 @@ err: goto out; } -void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - struct journal_replay *j) -{ - darray_for_each(j->ptrs, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev); - u64 offset; - - div64_u64_rem(i->sector, ca->mi.bucket_size, &offset); - - if (i != j->ptrs.data) - prt_printf(out, " "); - prt_printf(out, "%u:%u:%u (sector %llu)", - i->dev, i->bucket, i->bucket_offset, i->sector); - } -} - int bch2_journal_read(struct bch_fs *c, u64 *last_seq, u64 *blacklist_seq, -- cgit v1.2.3 From 52f7d75e7d36ae4258c4ceef4cb86dc2950e7906 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 27 Jan 2024 10:16:15 -0500 Subject: bcachefs: jset_entry_datetime This gives us a way to record the date and time every journal entry was written - useful for debugging. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 8 +++++++- fs/bcachefs/journal_io.c | 44 +++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/journal_io.h | 16 ++++++++++++++++ fs/bcachefs/sb-clean.c | 16 ---------------- 4 files changed, 67 insertions(+), 17 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 0668b682a21c..14f613617913 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1275,7 +1275,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(dev_usage, 8) \ x(log, 9) \ x(overwrite, 10) \ - x(write_buffer_keys, 11) + x(write_buffer_keys, 11) \ + x(datetime, 12) enum { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, @@ -1376,6 +1377,11 @@ struct jset_entry_log { u8 d[]; } __packed __aligned(8); +struct jset_entry_datetime { + struct jset_entry entry; + __le64 seconds; +} __packed __aligned(8); + /* * On disk format for a journal entry: * seq is monotonically increasing; every journal entry has its own unique diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 66ce522950ff..0ca6d976f4d5 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -39,6 +39,14 @@ static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); bch2_journal_ptrs_to_text(out, c, j); + + struct jset_entry *entry; + for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) { + struct jset_entry_datetime *datetime = + container_of(entry, struct jset_entry_datetime, entry); + bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); + break; + } } static struct nonce journal_nonce(const struct jset *jset) @@ -754,6 +762,37 @@ static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct journal_entry_btree_keys_to_text(out, c, entry); } +static int journal_entry_datetime_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) +{ + unsigned bytes = vstruct_bytes(entry); + unsigned expected = 16; + int ret = 0; + + if (journal_entry_err_on(vstruct_bytes(entry) < expected, + c, version, jset, entry, + journal_entry_dev_usage_bad_size, + "bad size (%u < %u)", + bytes, expected)) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } +fsck_err: + return ret; +} + +static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_datetime *datetime = + container_of(entry, struct jset_entry_datetime, entry); + + bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); +} + struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, @@ -1794,6 +1833,11 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); + struct jset_entry_datetime *d = + container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry); + d->entry.type = BCH_JSET_ENTRY_datetime; + d->seconds = cpu_to_le64(ktime_get_real_seconds()); + bch2_journal_super_entries_add_common(c, &end, seq); u64s = (u64 *) end - (u64 *) start; BUG_ON(u64s > j->entry_u64s_reserved); diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 1e0b9a571648..1f395f43cf76 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -65,4 +65,20 @@ int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *); CLOSURE_CALLBACK(bch2_journal_write); +static inline struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) +{ + struct jset_entry *entry = *end; + unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); + + memset(entry, 0, u64s * sizeof(u64)); + /* + * The u64s field counts from the start of data, ignoring the shared + * fields. + */ + entry->u64s = cpu_to_le16(u64s - 1); + + *end = vstruct_next(*end); + return entry; +} + #endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index b6bf0ebe7e84..5980ba2563fe 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -171,22 +171,6 @@ fsck_err: return ERR_PTR(ret); } -static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) -{ - struct jset_entry *entry = *end; - unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); - - memset(entry, 0, u64s * sizeof(u64)); - /* - * The u64s field counts from the start of data, ignoring the shared - * fields. - */ - entry->u64s = cpu_to_le16(u64s - 1); - - *end = vstruct_next(*end); - return entry; -} - void bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry **end, u64 journal_seq) -- cgit v1.2.3 From 51654002755b34b0bdedd03dd381bd6d46a657a5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Jan 2024 13:20:28 -0500 Subject: bcachefs: bio per journal buf Prep work for having multiple journal writes in flight. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 40 ++++++++++++++++++++++------------------ fs/bcachefs/journal_io.c | 21 +++++++++++---------- fs/bcachefs/journal_types.h | 2 +- 3 files changed, 34 insertions(+), 29 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index edbbedaa0def..f714fc7238f8 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1235,13 +1235,17 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) void bch2_dev_journal_exit(struct bch_dev *ca) { - kfree(ca->journal.bio); - kfree(ca->journal.buckets); - kfree(ca->journal.bucket_seq); + struct journal_device *ja = &ca->journal; + + for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { + kfree(ja->bio[i]); + ja->bio[i] = NULL; + } - ca->journal.bio = NULL; - ca->journal.buckets = NULL; - ca->journal.bucket_seq = NULL; + kfree(ja->buckets); + kfree(ja->bucket_seq); + ja->buckets = NULL; + ja->bucket_seq = NULL; } int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) @@ -1251,14 +1255,13 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) bch2_sb_field_get(sb, journal); struct bch_sb_field_journal_v2 *journal_buckets_v2 = bch2_sb_field_get(sb, journal_v2); - unsigned i, nr_bvecs; ja->nr = 0; if (journal_buckets_v2) { unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); - for (i = 0; i < nr; i++) + for (unsigned i = 0; i < nr; i++) ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr); } else if (journal_buckets) { ja->nr = bch2_nr_journal_buckets(journal_buckets); @@ -1268,13 +1271,14 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) if (!ja->bucket_seq) return -BCH_ERR_ENOMEM_dev_journal_init; - nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); + unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); - ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); - if (!ca->journal.bio) - return -BCH_ERR_ENOMEM_dev_journal_init; - - bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0); + for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { + ja->bio[i] = bio_kmalloc(nr_bvecs, GFP_KERNEL); + if (!ja->bio[i]) + return -BCH_ERR_ENOMEM_dev_journal_init; + bio_init(ja->bio[i], NULL, ja->bio[i]->bi_inline_vecs, nr_bvecs, 0); + } ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); if (!ja->buckets) @@ -1282,14 +1286,14 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) if (journal_buckets_v2) { unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); - unsigned j, dst = 0; + unsigned dst = 0; - for (i = 0; i < nr; i++) - for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) + for (unsigned i = 0; i < nr; i++) + for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) ja->buckets[dst++] = le64_to_cpu(journal_buckets_v2->d[i].start) + j; } else if (journal_buckets) { - for (i = 0; i < ja->nr; i++) + for (unsigned i = 0; i < ja->nr; i++) ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 0ca6d976f4d5..1dc8318e1f14 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1721,13 +1721,14 @@ static CLOSURE_CALLBACK(do_journal_write) { closure_type(j, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; - struct journal_buf *w = journal_last_unwritten_buf(j); - struct bio *bio; + unsigned buf_idx = journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK; + struct journal_buf *w = j->buf + buf_idx; unsigned sectors = vstruct_sectors(w->data, c->block_bits); extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct journal_device *ja = &ca->journal; + if (!percpu_ref_tryget(&ca->io_ref)) { /* XXX: fix this */ bch_err(c, "missing device for journal write\n"); @@ -1737,7 +1738,7 @@ static CLOSURE_CALLBACK(do_journal_write) this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], sectors); - bio = ca->journal.bio; + struct bio *bio = ja->bio[buf_idx]; bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = ptr->offset; bio->bi_end_io = journal_write_endio; @@ -1756,8 +1757,7 @@ static CLOSURE_CALLBACK(do_journal_write) trace_and_count(c, journal_write, bio); closure_bio_submit(bio, cl); - ca->journal.bucket_seq[ca->journal.cur_idx] = - le64_to_cpu(w->data->seq); + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); } continue_at(cl, journal_write_done, j->wq); @@ -1939,9 +1939,9 @@ CLOSURE_CALLBACK(bch2_journal_write) { closure_type(j, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *w = journal_last_unwritten_buf(j); + unsigned buf_idx = journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK; + struct journal_buf *w = j->buf + buf_idx; struct bch_replicas_padded replicas; - struct bio *bio; struct printbuf journal_debug_buf = PRINTBUF; unsigned nr_rw_members = 0; int ret; @@ -2023,7 +2023,8 @@ CLOSURE_CALLBACK(bch2_journal_write) for_each_rw_member(c, ca) { percpu_ref_get(&ca->io_ref); - bio = ca->journal.bio; + struct journal_device *ja = &ca->journal; + struct bio *bio = ja->bio[buf_idx]; bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH); bio->bi_end_io = journal_write_endio; diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 1493c262eaf4..79db1daa1de2 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -315,7 +315,7 @@ struct journal_device { u64 *buckets; /* Bio for journal reads/writes to this device */ - struct bio *bio; + struct bio *bio[JOURNAL_BUF_NR]; /* for bch_journal_read_device */ struct closure read; -- cgit v1.2.3 From 38789c25087471121cfa42333bfb249eae539682 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Jan 2024 13:42:48 -0500 Subject: bcachefs: closure per journal buf Prep work for having multiple journal writes in flight. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 18 ++++++++++++------ fs/bcachefs/journal_io.c | 34 +++++++++++++++++++--------------- fs/bcachefs/journal_types.h | 12 ++++++++++-- 3 files changed, 41 insertions(+), 23 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index f714fc7238f8..963933930366 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -186,8 +186,10 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write) if (__bch2_journal_pin_put(j, seq)) bch2_journal_reclaim_fast(j); - if (write) - closure_call(&j->io, bch2_journal_write, j->wq, NULL); + if (write) { + struct journal_buf *w = j->buf + (seq & JOURNAL_BUF_MASK); + closure_call(&w->io, bch2_journal_write, j->wq, NULL); + } } /* @@ -1274,10 +1276,14 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { - ja->bio[i] = bio_kmalloc(nr_bvecs, GFP_KERNEL); + ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, + nr_bvecs), GFP_KERNEL); if (!ja->bio[i]) return -BCH_ERR_ENOMEM_dev_journal_init; - bio_init(ja->bio[i], NULL, ja->bio[i]->bi_inline_vecs, nr_bvecs, 0); + + ja->bio[i]->ca = ca; + ja->bio[i]->buf_idx = i; + bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0); } ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); @@ -1340,6 +1346,7 @@ int bch2_fs_journal_init(struct journal *j) j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL); if (!j->buf[i].data) return -BCH_ERR_ENOMEM_journal_buf; + j->buf[i].idx = i; } j->pin.front = j->pin.back = 1; @@ -1459,7 +1466,6 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *pin; - unsigned i; spin_lock(&j->lock); *seq = max(*seq, j->pin.front); @@ -1477,7 +1483,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 prt_newline(out); printbuf_indent_add(out, 2); - for (i = 0; i < ARRAY_SIZE(pin_list->list); i++) + for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++) list_for_each_entry(pin, &pin_list->list[i], list) { prt_printf(out, "\t%px %ps", pin, pin->flush); prt_newline(out); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 1dc8318e1f14..d02e49956621 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1597,9 +1597,9 @@ static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) static CLOSURE_CALLBACK(journal_write_done) { - closure_type(j, struct journal, io); + closure_type(w, struct journal_buf, io); + struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *w = journal_last_unwritten_buf(j); struct bch_replicas_padded replicas; union journal_res_state old, new; u64 v, seq; @@ -1676,8 +1676,9 @@ static CLOSURE_CALLBACK(journal_write_done) if (!journal_state_count(new, new.unwritten_idx) && journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { + struct journal_buf *w = j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); spin_unlock(&j->lock); - closure_call(&j->io, bch2_journal_write, j->wq, NULL); + closure_call(&w->io, bch2_journal_write, j->wq, NULL); } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { struct journal_buf *buf = journal_cur_buf(j); @@ -1698,31 +1699,32 @@ static CLOSURE_CALLBACK(journal_write_done) static void journal_write_endio(struct bio *bio) { - struct bch_dev *ca = bio->bi_private; + struct journal_bio *jbio = container_of(bio, struct journal_bio, bio); + struct bch_dev *ca = jbio->ca; struct journal *j = &ca->fs->journal; - struct journal_buf *w = journal_last_unwritten_buf(j); - unsigned long flags; + struct journal_buf *w = j->buf + jbio->buf_idx; if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, "error writing journal entry %llu: %s", le64_to_cpu(w->data->seq), bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("journal")) { + unsigned long flags; + spin_lock_irqsave(&j->err_lock, flags); bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); spin_unlock_irqrestore(&j->err_lock, flags); } - closure_put(&j->io); + closure_put(&w->io); percpu_ref_put(&ca->io_ref); } static CLOSURE_CALLBACK(do_journal_write) { - closure_type(j, struct journal, io); + closure_type(w, struct journal_buf, io); + struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - unsigned buf_idx = journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK; - struct journal_buf *w = j->buf + buf_idx; unsigned sectors = vstruct_sectors(w->data, c->block_bits); extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { @@ -1738,7 +1740,7 @@ static CLOSURE_CALLBACK(do_journal_write) this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], sectors); - struct bio *bio = ja->bio[buf_idx]; + struct bio *bio = &ja->bio[w->idx]->bio; bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = ptr->offset; bio->bi_end_io = journal_write_endio; @@ -1937,10 +1939,9 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * CLOSURE_CALLBACK(bch2_journal_write) { - closure_type(j, struct journal, io); + closure_type(w, struct journal_buf, io); + struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - unsigned buf_idx = journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK; - struct journal_buf *w = j->buf + buf_idx; struct bch_replicas_padded replicas; struct printbuf journal_debug_buf = PRINTBUF; unsigned nr_rw_members = 0; @@ -2019,12 +2020,15 @@ CLOSURE_CALLBACK(bch2_journal_write) if (ret) goto err; + if (!JSET_NO_FLUSH(w->data)) + closure_wait_event(&j->async_wait, j->seq_ondisk + 1 == le64_to_cpu(w->data->seq)); + if (!JSET_NO_FLUSH(w->data) && w->separate_flush) { for_each_rw_member(c, ca) { percpu_ref_get(&ca->io_ref); struct journal_device *ja = &ca->journal; - struct bio *bio = ja->bio[buf_idx]; + struct bio *bio = &ja->bio[w->idx]->bio; bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH); bio->bi_end_io = journal_write_endio; diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 79db1daa1de2..d75fbd3b1d34 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -18,6 +18,7 @@ * the journal that are being staged or in flight. */ struct journal_buf { + struct closure io; struct jset *data; __BKEY_PADDED(key, BCH_REPLICAS_MAX); @@ -37,6 +38,7 @@ struct journal_buf { bool must_flush; /* something wants a flush */ bool separate_flush; bool need_flush_to_write_buffer; + u8 idx; }; /* @@ -150,6 +152,13 @@ enum journal_errors { typedef DARRAY(u64) darray_u64; +struct journal_bio { + struct bch_dev *ca; + unsigned buf_idx; + + struct bio bio; +}; + /* Embedded in struct bch_fs */ struct journal { /* Fastpath stuff up front: */ @@ -204,7 +213,6 @@ struct journal { wait_queue_head_t wait; struct closure_waitlist async_wait; - struct closure io; struct delayed_work write_work; struct workqueue_struct *wq; @@ -315,7 +323,7 @@ struct journal_device { u64 *buckets; /* Bio for journal reads/writes to this device */ - struct bio *bio[JOURNAL_BUF_NR]; + struct journal_bio *bio[JOURNAL_BUF_NR]; /* for bch_journal_read_device */ struct closure read; -- cgit v1.2.3 From 916abefd437be89383e720f19a43f727f045a468 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Jan 2024 14:26:15 -0500 Subject: bcachefs: better journal pipelining Recently a severe performance regression was discovered, which bisected to a6548c8b5eb5 bcachefs: Avoid flushing the journal in the discard path It turns out the old behaviour, which issued excessive journal flushes, worked around a performance issue where queueing delays would cause the journal to not be able to write quickly enough and stall. The journal flushes masked the issue because they periodically flushed the device write cache, reducing write latency for non flushes. This patch reworks the journalling code to allow more than one (non-flush) write to be in flight at a time. With this patch, doing 4k random writes and an iodepth of 128, we are now able to hit 560k iops to a Samsung 970 EVO Plus - previously, we were stuck in the ~200k range. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 47 ++++++++++++++++++----- fs/bcachefs/journal.h | 7 ++-- fs/bcachefs/journal_io.c | 92 ++++++++++++++++++++++++--------------------- fs/bcachefs/journal_types.h | 11 ++++-- 4 files changed, 98 insertions(+), 59 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 963933930366..fe5f7a944ad3 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -74,6 +74,13 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 prt_printf(out, "%li jiffies", buf->expires - jiffies); prt_newline(out); + if (buf->write_done) + prt_printf(out, "write done\n"); + else if (buf->write_allocated) + prt_printf(out, "write allocated\n"); + else if (buf->write_started) + prt_printf(out, "write started\n"); + printbuf_indent_sub(out, 2); } @@ -175,21 +182,40 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) return stuck; } +void bch2_journal_do_writes(struct journal *j) +{ + for (u64 seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j); + seq++) { + unsigned idx = seq & JOURNAL_BUF_MASK; + struct journal_buf *w = j->buf + idx; + + if (w->write_started && !w->write_allocated) + break; + if (w->write_started) + continue; + + if (!journal_state_count(j->reservations, idx)) { + w->write_started = true; + closure_call(&w->io, bch2_journal_write, j->wq, NULL); + } + + break; + } +} + /* * Final processing when the last reference of a journal buffer has been * dropped. Drop the pin list reference acquired at journal entry open and write * the buffer, if requested. */ -void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write) +void bch2_journal_buf_put_final(struct journal *j, u64 seq) { lockdep_assert_held(&j->lock); if (__bch2_journal_pin_put(j, seq)) bch2_journal_reclaim_fast(j); - if (write) { - struct journal_buf *w = j->buf + (seq & JOURNAL_BUF_MASK); - closure_call(&w->io, bch2_journal_write, j->wq, NULL); - } + bch2_journal_do_writes(j); } /* @@ -381,11 +407,14 @@ static int journal_entry_open(struct journal *j) BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); bkey_extent_init(&buf->key); - buf->noflush = false; - buf->must_flush = false; - buf->separate_flush = false; - buf->flush_time = 0; + buf->noflush = false; + buf->must_flush = false; + buf->separate_flush = false; + buf->flush_time = 0; buf->need_flush_to_write_buffer = true; + buf->write_started = false; + buf->write_allocated = false; + buf->write_done = false; memset(buf->data, 0, sizeof(*buf->data)); buf->data->seq = cpu_to_le64(journal_cur_seq(j)); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 4544ce24bb8a..7c7528f839c5 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -264,7 +264,8 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u } bool bch2_journal_entry_close(struct journal *); -void bch2_journal_buf_put_final(struct journal *, u64, bool); +void bch2_journal_do_writes(struct journal *); +void bch2_journal_buf_put_final(struct journal *, u64); static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) { @@ -272,7 +273,7 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s s = journal_state_buf_put(j, idx); if (!journal_state_count(s, idx)) - bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx); + bch2_journal_buf_put_final(j, seq); } static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) @@ -282,7 +283,7 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq s = journal_state_buf_put(j, idx); if (!journal_state_count(s, idx)) { spin_lock(&j->lock); - bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx); + bch2_journal_buf_put_final(j, seq); spin_unlock(&j->lock); } } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index d02e49956621..cd8921a2c0da 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1602,7 +1602,7 @@ static CLOSURE_CALLBACK(journal_write_done) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_replicas_padded replicas; union journal_res_state old, new; - u64 v, seq; + u64 v, seq = le64_to_cpu(w->data->seq); int err = 0; bch2_time_stats_update(!JSET_NO_FLUSH(w->data) @@ -1622,64 +1622,69 @@ static CLOSURE_CALLBACK(journal_write_done) if (err) bch2_fatal_error(c); - spin_lock(&j->lock); - seq = le64_to_cpu(w->data->seq); + closure_debug_destroy(cl); + spin_lock(&j->lock); if (seq >= j->pin.front) journal_seq_pin(j, seq)->devs = w->devs_written; + if (err && (!j->err_seq || seq < j->err_seq)) + j->err_seq = seq; + w->write_done = true; + + bool completed = false; + + for (seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j); + seq++) { + w = j->buf + (seq & JOURNAL_BUF_MASK); + if (!w->write_done) + break; - if (!err) { - if (!JSET_NO_FLUSH(w->data)) { + if (!j->err_seq && !JSET_NO_FLUSH(w->data)) { j->flushed_seq_ondisk = seq; j->last_seq_ondisk = w->last_seq; bch2_do_discards(c); closure_wake_up(&c->freelist_wait); - bch2_reset_alloc_cursors(c); } - } else if (!j->err_seq || seq < j->err_seq) - j->err_seq = seq; - j->seq_ondisk = seq; + j->seq_ondisk = seq; - /* - * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard - * more buckets: - * - * Must come before signaling write completion, for - * bch2_fs_journal_stop(): - */ - if (j->watermark != BCH_WATERMARK_stripe) - journal_reclaim_kick(&c->journal); + /* + * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard + * more buckets: + * + * Must come before signaling write completion, for + * bch2_fs_journal_stop(): + */ + if (j->watermark != BCH_WATERMARK_stripe) + journal_reclaim_kick(&c->journal); - /* also must come before signalling write completion: */ - closure_debug_destroy(cl); + v = atomic64_read(&j->reservations.counter); + do { + old.v = new.v = v; + BUG_ON(journal_state_count(new, new.unwritten_idx)); + BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK)); - v = atomic64_read(&j->reservations.counter); - do { - old.v = new.v = v; - BUG_ON(journal_state_count(new, new.unwritten_idx)); + new.unwritten_idx++; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - new.unwritten_idx++; - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - old.v, new.v)) != old.v); + completed = true; + } - bch2_journal_reclaim_fast(j); - bch2_journal_space_available(j); + if (completed) { + bch2_journal_reclaim_fast(j); + bch2_journal_space_available(j); - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], - &j->max_in_flight_start, false); + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], + &j->max_in_flight_start, false); - closure_wake_up(&w->wait); - journal_wake(j); + closure_wake_up(&w->wait); + journal_wake(j); + } - if (!journal_state_count(new, new.unwritten_idx) && - journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { - struct journal_buf *w = j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); - spin_unlock(&j->lock); - closure_call(&w->io, bch2_journal_write, j->wq, NULL); - } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && + if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { struct journal_buf *buf = journal_cur_buf(j); long delta = buf->expires - jiffies; @@ -1689,12 +1694,10 @@ static CLOSURE_CALLBACK(journal_write_done) * previous entries still in flight - the current journal entry * might want to be written now: */ - - spin_unlock(&j->lock); mod_delayed_work(j->wq, &j->write_work, max(0L, delta)); - } else { - spin_unlock(&j->lock); } + + spin_unlock(&j->lock); } static void journal_write_endio(struct bio *bio) @@ -1948,6 +1951,7 @@ CLOSURE_CALLBACK(bch2_journal_write) int ret; BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + BUG_ON(w->write_allocated); j->write_start_time = local_clock(); @@ -1991,12 +1995,14 @@ CLOSURE_CALLBACK(bch2_journal_write) * bch2_journal_space_available(): */ w->sectors = 0; + w->write_allocated = true; /* * journal entry has been compacted and allocated, recalculate space * available: */ bch2_journal_space_available(j); + bch2_journal_do_writes(j); spin_unlock(&j->lock); w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index d75fbd3b1d34..3696aac3ccb7 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -34,10 +34,13 @@ struct journal_buf { unsigned disk_sectors; /* maximum size entry could have been, if buf_size was bigger */ unsigned u64s_reserved; - bool noflush; /* write has already been kicked off, and was noflush */ - bool must_flush; /* something wants a flush */ - bool separate_flush; - bool need_flush_to_write_buffer; + bool noflush:1; /* write has already been kicked off, and was noflush */ + bool must_flush:1; /* something wants a flush */ + bool separate_flush:1; + bool need_flush_to_write_buffer:1; + bool write_started:1; + bool write_allocated:1; + bool write_done:1; u8 idx; }; -- cgit v1.2.3 From fc634d8e46ec1dcbecb0ce6f84dd3e8b0c6e9330 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 22 Jan 2024 14:37:42 -0500 Subject: bcachefs: btree_and_journal_iter.trans we now always have a btree_trans when using a btree_and_journal_iter; prep work for adding prefetching to btree_and_journal_iter Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 8 ++++---- fs/bcachefs/btree_iter.c | 2 +- fs/bcachefs/btree_journal_iter.c | 15 ++++++++------- fs/bcachefs/btree_journal_iter.h | 13 ++++++++----- 4 files changed, 21 insertions(+), 17 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 829da0f92f0d..2caf9f5e0c35 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -389,7 +389,7 @@ again: have_child = dropped_children = false; bch2_bkey_buf_init(&prev_k); bch2_bkey_buf_init(&cur_k); - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { BUG_ON(bpos_lt(k.k->p, b->data->min_key)); @@ -478,7 +478,7 @@ again: goto err; bch2_btree_and_journal_iter_exit(&iter); - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { bch2_bkey_buf_reassemble(&cur_k, c, k); @@ -931,7 +931,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b struct printbuf buf = PRINTBUF; int ret = 0; - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); bch2_bkey_buf_init(&prev); bch2_bkey_buf_init(&cur); bkey_init(&prev.k->k); @@ -963,7 +963,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b if (b->c.level > target_depth) { bch2_btree_and_journal_iter_exit(&iter); - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { struct btree *child; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index e6e7649dec79..8f185a98f96f 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -891,7 +891,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, struct bkey_s_c k; int ret = 0; - __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos); + __bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos); k = bch2_btree_and_journal_iter_peek(&jiter); diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 719a94a84950..fa907293ba43 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -376,17 +376,18 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) bch2_journal_iter_exit(&iter->journal); } -void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, - struct bch_fs *c, +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, + struct btree_and_journal_iter *iter, struct btree *b, struct btree_node_iter node_iter, struct bpos pos) { memset(iter, 0, sizeof(*iter)); + iter->trans = trans; iter->b = b; iter->node_iter = node_iter; - bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); + bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos); INIT_LIST_HEAD(&iter->journal.list); iter->pos = b->data->min_key; iter->at_end = false; @@ -396,15 +397,15 @@ void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter * this version is used by btree_gc before filesystem has gone RW and * multithreaded, so uses the journal_iters list: */ -void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, - struct bch_fs *c, +void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, + struct btree_and_journal_iter *iter, struct btree *b) { struct btree_node_iter node_iter; bch2_btree_node_iter_init_from_start(&node_iter, b); - __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); - list_add(&iter->journal.list, &c->journal_iters); + __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key); + list_add(&iter->journal.list, &trans->c->journal_iters); } /* sort and dedup all keys in the journal: */ diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 8ca4c100b2e3..1793cf89148b 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -15,6 +15,7 @@ struct journal_iter { */ struct btree_and_journal_iter { + struct btree_trans *trans; struct btree *b; struct btree_node_iter node_iter; struct bkey unpacked; @@ -29,6 +30,9 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, unsigned, struct bpos); +int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *, + struct btree_and_journal_iter *); + int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, unsigned, struct bkey_i *); int bch2_journal_key_insert(struct bch_fs *, enum btree_id, @@ -42,12 +46,11 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); -void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, - struct bch_fs *, struct btree *, +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *, + struct btree_and_journal_iter *, struct btree *, struct btree_node_iter, struct bpos); -void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, - struct bch_fs *, - struct btree *); +void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *, + struct btree_and_journal_iter *, struct btree *); void bch2_journal_keys_put(struct bch_fs *); -- cgit v1.2.3 From 5f43b0134e40bd798ba0999a11e90f24a0c65a51 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 22 Jan 2024 14:25:00 -0500 Subject: bcachefs: btree node prefetching in check_topology btree_and_journal_iter is old code that we want to get rid of, but we're not ready to yet. lack of btree node prefetching is, it turns out, a real performance issue for fsck on spinning rust, so - add it. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 10 +++++++--- fs/bcachefs/btree_gc.c | 3 +++ fs/bcachefs/btree_journal_iter.c | 31 +++++++++++++++++++++++++++++++ fs/bcachefs/btree_journal_iter.h | 1 + 4 files changed, 42 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index d7c81beac14a..a8b393bc7567 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -711,6 +711,9 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, b = bch2_btree_node_mem_alloc(trans, level != 0); if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) { + if (!path) + return b; + trans->memory_allocation_failure = true; trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); @@ -760,8 +763,9 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, } if (!six_relock_type(&b->c.lock, lock_type, seq)) { - if (path) - trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); + BUG_ON(!path); + + trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); } @@ -1096,7 +1100,7 @@ int bch2_btree_node_prefetch(struct btree_trans *trans, struct btree_cache *bc = &c->btree_cache; struct btree *b; - BUG_ON(trans && !btree_node_locked(path, level + 1)); + BUG_ON(path && !btree_node_locked(path, level + 1)); BUG_ON(level >= BTREE_MAX_DEPTH); b = btree_cache_find(bc, k); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 2caf9f5e0c35..3005b39d3a1a 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -390,6 +390,7 @@ again: bch2_bkey_buf_init(&prev_k); bch2_bkey_buf_init(&cur_k); bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); + iter.prefetch = true; while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { BUG_ON(bpos_lt(k.k->p, b->data->min_key)); @@ -479,6 +480,7 @@ again: bch2_btree_and_journal_iter_exit(&iter); bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); + iter.prefetch = true; while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { bch2_bkey_buf_reassemble(&cur_k, c, k); @@ -964,6 +966,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b if (b->c.level > target_depth) { bch2_btree_and_journal_iter_exit(&iter); bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); + iter.prefetch = true; while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { struct btree *child; diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index fa907293ba43..b7ac93c8fdd8 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_buf.h" #include "bset.h" +#include "btree_cache.h" #include "btree_journal_iter.h" #include "journal_io.h" @@ -334,9 +336,38 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) iter->pos = bpos_successor(iter->pos); } +static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter) +{ + struct btree_and_journal_iter iter = *_iter; + struct bch_fs *c = iter.trans->c; + unsigned level = iter.journal.level; + struct bkey_buf tmp; + unsigned nr = test_bit(BCH_FS_started, &c->flags) + ? (level > 1 ? 0 : 2) + : (level > 1 ? 1 : 16); + + iter.prefetch = false; + bch2_bkey_buf_init(&tmp); + + while (nr--) { + bch2_btree_and_journal_iter_advance(&iter); + struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter); + if (!k.k) + break; + + bch2_bkey_buf_reassemble(&tmp, c, k); + bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1); + } + + bch2_bkey_buf_exit(&tmp, c); +} + struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) { struct bkey_s_c btree_k, journal_k, ret; + + if (iter->prefetch && iter->journal.level) + btree_and_journal_iter_prefetch(iter); again: if (iter->at_end) return bkey_s_c_null; diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 1793cf89148b..c9d19da3ea04 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -23,6 +23,7 @@ struct btree_and_journal_iter { struct journal_iter journal; struct bpos pos; bool at_end; + bool prefetch; }; struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, -- cgit v1.2.3 From 7f76b08acac4b41bd4bf1670470dd0770d61d0b9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Jan 2024 16:46:45 -0500 Subject: bcachefs: Subvolumes may now be renamed Files within a subvolume cannot be renamed into another subvolume, but subvolumes themselves were intended to be. This implements subvolume renaming - we need to ensure that there's only a single dirent that points to a subvolume key (not multiple versions in different snapshots), and we need to ensure that dirent.d_parent_subol and inode.bi_parent_subvol are updated. Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 66 ++++++++++++++++++++++++++++++------------------- fs/bcachefs/fs-common.c | 15 +++++++++++ 2 files changed, 55 insertions(+), 26 deletions(-) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 8a4572594545..0dfcb194cb1e 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -293,12 +293,10 @@ int bch2_dirent_rename(struct btree_trans *trans, struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; struct bpos dst_pos = POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); - unsigned src_type = 0, dst_type = 0, src_update_flags = 0; + unsigned src_update_flags = 0; + bool delete_src, delete_dst; int ret = 0; - if (src_dir.subvol != dst_dir.subvol) - return -EXDEV; - memset(src_inum, 0, sizeof(*src_inum)); memset(dst_inum, 0, sizeof(*dst_inum)); @@ -319,12 +317,6 @@ int bch2_dirent_rename(struct btree_trans *trans, if (ret) goto out; - src_type = bkey_s_c_to_dirent(old_src).v->d_type; - - if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE) - return -EOPNOTSUPP; - - /* Lookup dst: */ if (mode == BCH_RENAME) { /* @@ -352,11 +344,6 @@ int bch2_dirent_rename(struct btree_trans *trans, bkey_s_c_to_dirent(old_dst), dst_inum); if (ret) goto out; - - dst_type = bkey_s_c_to_dirent(old_dst).v->d_type; - - if (dst_type == DT_SUBVOL) - return -EOPNOTSUPP; } if (mode != BCH_RENAME_EXCHANGE) @@ -426,28 +413,55 @@ int bch2_dirent_rename(struct btree_trans *trans, } } + if (new_dst->v.d_type == DT_SUBVOL) + new_dst->v.d_parent_subvol = cpu_to_le32(dst_dir.subvol); + + if ((mode == BCH_RENAME_EXCHANGE) && + new_src->v.d_type == DT_SUBVOL) + new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol); + ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); if (ret) goto out; out_set_src: - /* - * If we're deleting a subvolume, we need to really delete the dirent, - * not just emit a whiteout in the current snapshot: + * If we're deleting a subvolume we need to really delete the dirent, + * not just emit a whiteout in the current snapshot - there can only be + * single dirent that points to a given subvolume. + * + * IOW, we don't maintain multiple versions in different snapshots of + * dirents that point to subvolumes - dirents that point to subvolumes + * are only visible in one particular subvolume so it's not necessary, + * and it would be particularly confusing for fsck to have to deal with. */ - if (src_type == DT_SUBVOL) { - bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); - ret = bch2_btree_iter_traverse(&src_iter); + delete_src = bkey_s_c_to_dirent(old_src).v->d_type == DT_SUBVOL && + new_src->k.p.snapshot != old_src.k->p.snapshot; + + delete_dst = old_dst.k && + bkey_s_c_to_dirent(old_dst).v->d_type == DT_SUBVOL && + new_dst->k.p.snapshot != old_dst.k->p.snapshot; + + if (!delete_src || !bkey_deleted(&new_src->k)) { + ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); if (ret) goto out; + } - new_src->k.p = src_iter.pos; - src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE; + if (delete_src) { + bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); + ret = bch2_btree_iter_traverse(&src_iter) ?: + bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + if (ret) + goto out; } - ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); - if (ret) - goto out; + if (delete_dst) { + bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot); + ret = bch2_btree_iter_traverse(&dst_iter) ?: + bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + if (ret) + goto out; + } if (mode == BCH_RENAME_EXCHANGE) *src_offset = new_src->k.p.offset; diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 8ee716e4c2e7..523507e38887 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -410,6 +410,21 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } + /* Can't move across subvolumes, unless it's a subvolume root: */ + if (src_dir.subvol != dst_dir.subvol && + (!src_inode_u->bi_subvol || + (dst_inum.inum && !dst_inode_u->bi_subvol))) { + ret = -EXDEV; + goto err; + } + + if (src_inode_u->bi_parent_subvol) + src_inode_u->bi_parent_subvol = dst_dir.subvol; + + if ((mode == BCH_RENAME_EXCHANGE) && + dst_inode_u->bi_parent_subvol) + dst_inode_u->bi_parent_subvol = src_dir.subvol; + src_inode_u->bi_dir = dst_dir_u->bi_inum; src_inode_u->bi_dir_offset = dst_offset; -- cgit v1.2.3 From f8f8fb443b3cfc362cfa57c105f78537e666a48a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 Feb 2024 15:05:17 -0500 Subject: bcachefs: Switch to uuid_to_fsid() switch the statfs code from something horrible and open coded to the more standard uuid_to_fsid() Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 77ae65542db9..ec9cf4b8faf1 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1572,7 +1572,6 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) * number: */ u64 avail_inodes = ((usage.capacity - usage.used) << 3); - u64 fsid; buf->f_type = BCACHEFS_STATFS_MAGIC; buf->f_bsize = sb->s_blocksize; @@ -1583,10 +1582,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = usage.nr_inodes + avail_inodes; buf->f_ffree = avail_inodes; - fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ - le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); - buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; - buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + buf->f_fsid = uuid_to_fsid(c->sb.user_uuid.b); buf->f_namelen = BCH_NAME_MAX; return 0; -- cgit v1.2.3 From 29223b5a555e90844fda741088831d06bb52b882 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 Feb 2024 15:23:07 -0500 Subject: bcachefs: Initialize super_block->s_uuid Need to fix this oversight for the new FS_IOC_(GET|SET)UUID ioctls. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index ec9cf4b8faf1..be0c059d70f0 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1878,6 +1878,7 @@ got_sb: sb->s_time_gran = c->sb.nsec_per_time_unit; sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); + sb->s_uuid = c->sb.user_uuid; c->vfs_sb = sb; strscpy(sb->s_id, c->name, sizeof(sb->s_id)); -- cgit v1.2.3 From 69c8e6ce022fd87abb9c8dbbdcfd312d6513b055 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 1 Feb 2024 07:35:28 -0500 Subject: bcachefs: move fsck_write_inode() to inode.c Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 53 +++++++++++++---------------------------------------- fs/bcachefs/inode.c | 28 ++++++++++++++++++++++++++++ fs/bcachefs/inode.h | 3 +++ 3 files changed, 44 insertions(+), 40 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index e92efc944823..76d7144aae7e 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -142,34 +142,6 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans, return 0; } -static int __write_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u32 snapshot) -{ - struct bkey_inode_buf *inode_p = - bch2_trans_kmalloc(trans, sizeof(*inode_p)); - - if (IS_ERR(inode_p)) - return PTR_ERR(inode_p); - - bch2_inode_pack(inode_p, inode); - inode_p->inode.k.p.snapshot = snapshot; - - return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, - &inode_p->inode.k_i, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -} - -static int fsck_write_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u32 snapshot) -{ - int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - __write_inode(trans, inode, snapshot)); - bch_err_fn(trans->c, ret); - return ret; -} - static int __remove_dirent(struct btree_trans *trans, struct bpos pos) { struct bch_fs *c = trans->c; @@ -312,7 +284,7 @@ static int reattach_inode(struct btree_trans *trans, if (S_ISDIR(inode->bi_mode)) { lostfound.bi_nlink++; - ret = __write_inode(trans, &lostfound, U32_MAX); + ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX); if (ret) return ret; } @@ -334,7 +306,7 @@ static int reattach_inode(struct btree_trans *trans, inode->bi_dir = lostfound.bi_inum; inode->bi_dir_offset = dir_offset; - return __write_inode(trans, inode, inode_snapshot); + return __bch2_fsck_write_inode(trans, inode, inode_snapshot); } static int remove_backpointer(struct btree_trans *trans, @@ -858,7 +830,8 @@ static int check_inode(struct btree_trans *trans, u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked; - ret = __write_inode(trans, &u, iter->pos.snapshot); + ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot); + bch_err_msg(c, ret, "in fsck updating inode"); if (ret) return ret; @@ -948,7 +921,7 @@ static int check_inode(struct btree_trans *trans, } if (do_update) { - ret = __write_inode(trans, &u, iter->pos.snapshot); + ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot); bch_err_msg(c, ret, "in fsck updating inode"); if (ret) return ret; @@ -1029,7 +1002,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) w->last_pos.inode, i->snapshot, i->inode.bi_sectors, i->count)) { i->inode.bi_sectors = i->count; - ret = fsck_write_inode(trans, &i->inode, i->snapshot); + ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot); if (ret) break; } @@ -1478,7 +1451,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) "directory %llu:%u with wrong i_nlink: got %u, should be %llu", w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { i->inode.bi_nlink = i->count; - ret = fsck_write_inode(trans, &i->inode, i->snapshot); + ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot); if (ret) break; } @@ -1505,7 +1478,7 @@ static int check_dirent_target(struct btree_trans *trans, target->bi_dir = d.k->p.inode; target->bi_dir_offset = d.k->p.offset; - ret = __write_inode(trans, target, target_snapshot); + ret = __bch2_fsck_write_inode(trans, target, target_snapshot); if (ret) goto err; } @@ -1545,7 +1518,7 @@ static int check_dirent_target(struct btree_trans *trans, target->bi_nlink++; target->bi_flags &= ~BCH_INODE_unlinked; - ret = __write_inode(trans, target, target_snapshot); + ret = __bch2_fsck_write_inode(trans, target, target_snapshot); if (ret) goto err; } @@ -1563,7 +1536,7 @@ static int check_dirent_target(struct btree_trans *trans, target->bi_dir = d.k->p.inode; target->bi_dir_offset = d.k->p.offset; - ret = __write_inode(trans, target, target_snapshot); + ret = __bch2_fsck_write_inode(trans, target, target_snapshot); if (ret) goto err; } @@ -1741,7 +1714,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, target_inum, subvol_root.bi_subvol, target_subvol)) { subvol_root.bi_subvol = target_subvol; - ret = __write_inode(trans, &subvol_root, target_snapshot); + ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot); if (ret) goto err; } @@ -1915,7 +1888,7 @@ static int check_root_trans(struct btree_trans *trans) 0, NULL); root_inode.bi_inum = inum; - ret = __write_inode(trans, &root_inode, snapshot); + ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot); bch_err_msg(c, ret, "writing root inode"); } err: @@ -2287,7 +2260,7 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], bch2_inode_nlink_get(&u), link->count)) { bch2_inode_nlink_set(&u, link->count); - ret = __write_inode(trans, &u, k.k->p.snapshot); + ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot); } fsck_err: return ret; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index b07ab98e460e..ee298a47425f 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -384,6 +384,34 @@ int bch2_inode_write_flags(struct btree_trans *trans, return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags); } +int __bch2_fsck_write_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) +{ + struct bkey_inode_buf *inode_p = + bch2_trans_kmalloc(trans, sizeof(*inode_p)); + + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); + + bch2_inode_pack(inode_p, inode); + inode_p->inode.k.p.snapshot = snapshot; + + return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, + &inode_p->inode.k_i, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +} + +int bch2_fsck_write_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) +{ + int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + __bch2_fsck_write_inode(trans, inode, snapshot)); + bch_err_fn(trans->c, ret); + return ret; +} + struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) { struct bch_inode_unpacked u; diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index b63f312581cf..b8da7ff8069d 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -108,6 +108,9 @@ static inline int bch2_inode_write(struct btree_trans *trans, return bch2_inode_write_flags(trans, iter, inode, 0); } +int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32); +int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32); + void bch2_inode_init_early(struct bch_fs *, struct bch_inode_unpacked *); void bch2_inode_init_late(struct bch_inode_unpacked *, u64, -- cgit v1.2.3 From 82fdc1dc9831edee465ac4d07323003f8c2bcfa8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Feb 2024 19:28:03 -0500 Subject: bcachefs: bump max_active on btree_interior_update_worker WQ_UNBOUND with max_active 1 means ordered workqueue, but we don't actually need or want ordered semantics - and probably want a higher concurrency limit anyways. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 7203ea8d5026..8b0291cfa872 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2534,7 +2534,7 @@ void bch2_fs_btree_interior_update_init_early(struct bch_fs *c) int bch2_fs_btree_interior_update_init(struct bch_fs *c) { c->btree_interior_update_worker = - alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); + alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8); if (!c->btree_interior_update_worker) return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; -- cgit v1.2.3 From ce3e9283de18a1eb77e0f8009fc7c948677462bf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Feb 2024 21:44:23 -0500 Subject: bcachefs: Kill some -EINVALs Repurposing standard error codes in bcachefs code is banned in new code, and we need to get rid of the remaining ones - private error codes give us much better error messages. Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 2 ++ fs/bcachefs/migrate.c | 8 +++----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 8c40c2067a04..3fd33b307a77 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -176,6 +176,8 @@ x(EINVAL, invalid) \ x(EINVAL, internal_fsck_err) \ x(EINVAL, opt_parse_error) \ + x(EINVAL, remove_with_metadata_missing_unimplemented)\ + x(EINVAL, remove_would_lose_data) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 5623cee3ef86..69098eeb5d48 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -31,7 +31,7 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, nr_good = bch2_bkey_durability(c, k.s_c); if ((!nr_good && !(flags & lost)) || (nr_good < replicas && !(flags & degraded))) - return -EINVAL; + return -BCH_ERR_remove_would_lose_data; return 0; } @@ -111,7 +111,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) /* don't handle this yet: */ if (flags & BCH_FORCE_IF_METADATA_LOST) - return -EINVAL; + return -BCH_ERR_remove_with_metadata_missing_unimplemented; trans = bch2_trans_get(c); bch2_bkey_buf_init(&k); @@ -132,10 +132,8 @@ retry: ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true); - if (ret) { - bch_err(c, "Cannot drop device without losing data"); + if (ret) break; - } ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { -- cgit v1.2.3 From 11def1888f26e7ec373e85d77c5b425632ef175f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Feb 2024 03:22:29 -0500 Subject: bcachefs: Factor out check_subvol_dirent() Going to be adding more code here for checking subvol structure. Signed-off-by: Kent Overstreet --- MAINTAINERS | 1 + fs/bcachefs/fsck.c | 105 +++++++++++++++++++++++++++++------------------------ 2 files changed, 58 insertions(+), 48 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 2ecaaec6a6bf..c441ded79431 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3536,6 +3536,7 @@ R: Brian Foster L: linux-bcachefs@vger.kernel.org S: Supported C: irc://irc.oftc.net/bcache +T: git https://evilpiepirate.org/git/bcachefs.git F: fs/bcachefs/ BDISP ST MEDIA DRIVER diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 76d7144aae7e..f05896dcb4bd 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1593,6 +1593,58 @@ fsck_err: return ret; } +static int check_subvol_dirent(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c_dirent d) +{ + struct bch_fs *c = trans->c; + struct bch_inode_unpacked subvol_root; + u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); + u32 target_snapshot; + u64 target_inum; + int ret = 0; + + ret = subvol_lookup(trans, target_subvol, + &target_snapshot, &target_inum); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(ret, c, dirent_to_missing_subvol, + "dirent points to missing subvolume %u", + le32_to_cpu(d.v->d_child_subvol))) + return __remove_dirent(trans, d.k->p); + + ret = lookup_inode(trans, target_inum, + &subvol_root, &target_snapshot); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(ret, c, subvol_to_missing_root, + "subvolume %u points to missing subvolume root %llu", + target_subvol, + target_inum)) { + bch_err(c, "repair not implemented yet"); + return -EINVAL; + } + + if (fsck_err_on(subvol_root.bi_subvol != target_subvol, + c, subvol_root_wrong_bi_subvol, + "subvol root %llu has wrong bi_subvol field: got %u, should be %u", + target_inum, + subvol_root.bi_subvol, target_subvol)) { + subvol_root.bi_subvol = target_subvol; + ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot); + if (ret) + return ret; + } + + ret = check_dirent_target(trans, iter, d, &subvol_root, + target_snapshot); + if (ret) + return ret; +fsck_err: + return ret; +} + static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, struct bch_hash_info *hash_info, @@ -1677,50 +1729,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, d = bkey_s_c_to_dirent(k); if (d.v->d_type == DT_SUBVOL) { - struct bch_inode_unpacked subvol_root; - u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); - u32 target_snapshot; - u64 target_inum; - - ret = subvol_lookup(trans, target_subvol, - &target_snapshot, &target_inum); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (fsck_err_on(ret, c, dirent_to_missing_subvol, - "dirent points to missing subvolume %u", - le32_to_cpu(d.v->d_child_subvol))) { - ret = __remove_dirent(trans, d.k->p); - goto err; - } - - ret = lookup_inode(trans, target_inum, - &subvol_root, &target_snapshot); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (fsck_err_on(ret, c, subvol_to_missing_root, - "subvolume %u points to missing subvolume root %llu", - target_subvol, - target_inum)) { - bch_err(c, "repair not implemented yet"); - ret = -EINVAL; - goto err; - } - - if (fsck_err_on(subvol_root.bi_subvol != target_subvol, - c, subvol_root_wrong_bi_subvol, - "subvol root %llu has wrong bi_subvol field: got %u, should be %u", - target_inum, - subvol_root.bi_subvol, target_subvol)) { - subvol_root.bi_subvol = target_subvol; - ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot); - if (ret) - goto err; - } - - ret = check_dirent_target(trans, iter, d, &subvol_root, - target_snapshot); + ret = check_subvol_dirent(trans, iter, d); if (ret) goto err; } else { @@ -1746,11 +1755,11 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; } - } - if (d.v->d_type == DT_DIR) - for_each_visible_inode(c, s, dir, equiv.snapshot, i) - i->count++; + if (d.v->d_type == DT_DIR) + for_each_visible_inode(c, s, dir, equiv.snapshot, i) + i->count++; + } out: err: fsck_err: -- cgit v1.2.3 From 3d4998c202978e5389b660cdfa33299d531b3f4f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Feb 2024 19:38:19 -0500 Subject: bcachefs: factor out check_inode_backpointer() Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index f05896dcb4bd..24f8657370ce 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1461,16 +1461,15 @@ fsck_err: return ret ?: trans_was_restarted(trans, restart_count); } -static int check_dirent_target(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target, - u32 target_snapshot) +static int check_inode_backpointer(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + u32 target_snapshot) { struct bch_fs *c = trans->c; - struct bkey_i_dirent *n; - struct printbuf buf = PRINTBUF; struct btree_iter bp_iter = { NULL }; + struct printbuf buf = PRINTBUF; int ret = 0; if (!target->bi_dir && @@ -1541,6 +1540,29 @@ static int check_dirent_target(struct btree_trans *trans, goto err; } } +out: +err: +fsck_err: + bch2_trans_iter_exit(trans, &bp_iter); + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} + +static int check_dirent_target(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + u32 target_snapshot) +{ + struct bch_fs *c = trans->c; + struct bkey_i_dirent *n; + struct printbuf buf = PRINTBUF; + int ret = 0; + + ret = check_inode_backpointer(trans, iter, d, target, target_snapshot); + if (ret) + goto err; if (fsck_err_on(d.v->d_type != inode_d_type(target), c, dirent_d_type_wrong, @@ -1584,10 +1606,8 @@ static int check_dirent_target(struct btree_trans *trans, d = dirent_i_to_s_c(n); } -out: err: fsck_err: - bch2_trans_iter_exit(trans, &bp_iter); printbuf_exit(&buf); bch_err_fn(c, ret); return ret; -- cgit v1.2.3 From 3f6d5e6a468d02676244b868b210433831846127 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 25 Jan 2024 19:00:24 -0500 Subject: mm: introduce memalloc_flags_{save,restore} Our proliferation of memalloc_*_{save,restore} APIs is getting a bit silly, this adds a generic version and converts the existing save/restore functions to wrappers. Signed-off-by: Kent Overstreet Cc: Vlastimil Babka Cc: Matthew Wilcox Cc: Michal Hocko Cc: Darrick J. Wong Cc: linux-mm@kvack.org Acked-by: Vlastimil Babka --- include/linux/sched/mm.h | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 9a19f1b42f64..f00d7ecc2adf 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -306,6 +306,24 @@ static inline void might_alloc(gfp_t gfp_mask) might_sleep_if(gfpflags_allow_blocking(gfp_mask)); } +/** + * memalloc_flags_save - Add a PF_* flag to current->flags, save old value + * + * This allows PF_* flags to be conveniently added, irrespective of current + * value, and then the old version restored with memalloc_flags_restore(). + */ +static inline unsigned memalloc_flags_save(unsigned flags) +{ + unsigned oldflags = ~current->flags & flags; + current->flags |= flags; + return oldflags; +} + +static inline void memalloc_flags_restore(unsigned flags) +{ + current->flags &= ~flags; +} + /** * memalloc_noio_save - Marks implicit GFP_NOIO allocation scope. * @@ -319,9 +337,7 @@ static inline void might_alloc(gfp_t gfp_mask) */ static inline unsigned int memalloc_noio_save(void) { - unsigned int flags = current->flags & PF_MEMALLOC_NOIO; - current->flags |= PF_MEMALLOC_NOIO; - return flags; + return memalloc_flags_save(PF_MEMALLOC_NOIO); } /** @@ -334,7 +350,7 @@ static inline unsigned int memalloc_noio_save(void) */ static inline void memalloc_noio_restore(unsigned int flags) { - current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags; + memalloc_flags_restore(flags); } /** @@ -350,9 +366,7 @@ static inline void memalloc_noio_restore(unsigned int flags) */ static inline unsigned int memalloc_nofs_save(void) { - unsigned int flags = current->flags & PF_MEMALLOC_NOFS; - current->flags |= PF_MEMALLOC_NOFS; - return flags; + return memalloc_flags_save(PF_MEMALLOC_NOFS); } /** @@ -365,32 +379,27 @@ static inline unsigned int memalloc_nofs_save(void) */ static inline void memalloc_nofs_restore(unsigned int flags) { - current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags; + memalloc_flags_restore(flags); } static inline unsigned int memalloc_noreclaim_save(void) { - unsigned int flags = current->flags & PF_MEMALLOC; - current->flags |= PF_MEMALLOC; - return flags; + return memalloc_flags_save(PF_MEMALLOC); } static inline void memalloc_noreclaim_restore(unsigned int flags) { - current->flags = (current->flags & ~PF_MEMALLOC) | flags; + memalloc_flags_restore(flags); } static inline unsigned int memalloc_pin_save(void) { - unsigned int flags = current->flags & PF_MEMALLOC_PIN; - - current->flags |= PF_MEMALLOC_PIN; - return flags; + return memalloc_flags_save(PF_MEMALLOC_PIN); } static inline void memalloc_pin_restore(unsigned int flags) { - current->flags = (current->flags & ~PF_MEMALLOC_PIN) | flags; + memalloc_flags_restore(flags); } #ifdef CONFIG_MEMCG -- cgit v1.2.3 From eab0af905bfc3e9c05da2ca163d76a1513159aa4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 25 Jan 2024 19:00:24 -0500 Subject: mm: introduce PF_MEMALLOC_NORECLAIM, PF_MEMALLOC_NOWARN Introduce PF_MEMALLOC_* equivalents of some GFP_ flags: PF_MEMALLOC_NORECLAIM -> GFP_NOWAIT PF_MEMALLOC_NOWARN -> __GFP_NOWARN Cc: Vlastimil Babka Cc: Matthew Wilcox Cc: Michal Hocko Cc: Darrick J. Wong Cc: linux-mm@kvack.org Signed-off-by: Kent Overstreet --- include/linux/sched.h | 4 ++-- include/linux/sched/mm.h | 17 +++++++++++++---- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index ffe8f618ab86..192e2c892040 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1636,8 +1636,8 @@ extern struct pid *cad_pid; * I am cleaning dirty pages from some other bdi. */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ -#define PF__HOLE__00800000 0x00800000 -#define PF__HOLE__01000000 0x01000000 +#define PF_MEMALLOC_NORECLAIM 0x00800000 /* All allocation requests will clear __GFP_DIRECT_RECLAIM */ +#define PF_MEMALLOC_NOWARN 0x01000000 /* All allocation requests will inherit __GFP_NOWARN */ #define PF__HOLE__02000000 0x02000000 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index f00d7ecc2adf..c29059a76052 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -236,16 +236,25 @@ static inline gfp_t current_gfp_context(gfp_t flags) { unsigned int pflags = READ_ONCE(current->flags); - if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) { + if (unlikely(pflags & (PF_MEMALLOC_NOIO | + PF_MEMALLOC_NOFS | + PF_MEMALLOC_NORECLAIM | + PF_MEMALLOC_NOWARN | + PF_MEMALLOC_PIN))) { /* - * NOIO implies both NOIO and NOFS and it is a weaker context - * so always make sure it makes precedence + * Stronger flags before weaker flags: + * NORECLAIM implies NOIO, which in turn implies NOFS */ - if (pflags & PF_MEMALLOC_NOIO) + if (pflags & PF_MEMALLOC_NORECLAIM) + flags &= ~__GFP_DIRECT_RECLAIM; + else if (pflags & PF_MEMALLOC_NOIO) flags &= ~(__GFP_IO | __GFP_FS); else if (pflags & PF_MEMALLOC_NOFS) flags &= ~__GFP_FS; + if (pflags & PF_MEMALLOC_NOWARN) + flags |= __GFP_NOWARN; + if (pflags & PF_MEMALLOC_PIN) flags &= ~__GFP_MOVABLE; } -- cgit v1.2.3 From a91bc5e50558fdc09a04c90484633df100d0d2bb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 25 Jan 2024 20:25:49 -0500 Subject: bcachefs: bch2_inode_insert() Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 138 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 76 insertions(+), 62 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index be0c059d70f0..bcc1436c1720 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -176,45 +176,88 @@ static unsigned bch2_inode_hash(subvol_inum inum) return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); } -struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) +static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode) { - struct bch_inode_unpacked inode_u; - struct bch_inode_info *inode; - struct btree_trans *trans; - struct bch_subvolume subvol; - int ret; + subvol_inum inum = inode_inum(inode); + struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v, + bch2_inode_hash(inum), + bch2_iget5_test, + bch2_iget5_set, + &inum)); + BUG_ON(!old); - inode = to_bch_ei(iget5_locked(c->vfs_sb, - bch2_inode_hash(inum), - bch2_iget5_test, - bch2_iget5_set, - &inum)); - if (unlikely(!inode)) - return ERR_PTR(-ENOMEM); - if (!(inode->v.i_state & I_NEW)) - return &inode->v; + if (unlikely(old != inode)) { + discard_new_inode(&inode->v); + inode = old; + } else { + mutex_lock(&c->vfs_inodes_lock); + list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); + mutex_unlock(&c->vfs_inodes_lock); + /* + * we really don't want insert_inode_locked2() to be setting + * I_NEW... + */ + unlock_new_inode(&inode->v); + } - trans = bch2_trans_get(c); - ret = lockrestart_do(trans, - bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: - bch2_inode_find_by_inum_trans(trans, inum, &inode_u)); + return inode; +} - if (!ret) - bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); - bch2_trans_put(trans); +#define memalloc_flags_do(_flags, _do) \ +({ \ + unsigned _saved_flags = memalloc_flags_save(_flags); \ + typeof(_do) _ret = _do; \ + memalloc_noreclaim_restore(_saved_flags); \ + _ret; \ +}) - if (ret) { - iget_failed(&inode->v); - return ERR_PTR(bch2_err_class(ret)); +/* + * Allocate a new inode, dropping/retaking btree locks if necessary: + */ +static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + + struct bch_inode_info *inode = + memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN, + to_bch_ei(new_inode(c->vfs_sb))); + + if (unlikely(!inode)) { + int ret = drop_locks_do(trans, (inode = to_bch_ei(new_inode(c->vfs_sb))) ? 0 : -ENOMEM); + if (ret && inode) + discard_new_inode(&inode->v); + if (ret) + return ERR_PTR(ret); } - mutex_lock(&c->vfs_inodes_lock); - list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); - mutex_unlock(&c->vfs_inodes_lock); + return inode; +} - unlock_new_inode(&inode->v); +struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) +{ + struct bch_inode_info *inode = + to_bch_ei(ilookup5_nowait(c->vfs_sb, + bch2_inode_hash(inum), + bch2_iget5_test, + &inum)); + if (inode) + return &inode->v; - return &inode->v; + struct btree_trans *trans = bch2_trans_get(c); + + struct bch_inode_unpacked inode_u; + struct bch_subvolume subvol; + int ret = lockrestart_do(trans, + bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: + bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: + PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans)); + if (!ret) { + bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); + inode = bch2_inode_insert(c, inode); + } + bch2_trans_put(trans); + + return ret ? ERR_PTR(ret) : &inode->v; } struct bch_inode_info * @@ -226,7 +269,7 @@ __bch2_create(struct mnt_idmap *idmap, struct bch_fs *c = dir->v.i_sb->s_fs_info; struct btree_trans *trans; struct bch_inode_unpacked dir_u; - struct bch_inode_info *inode, *old; + struct bch_inode_info *inode; struct bch_inode_unpacked inode_u; struct posix_acl *default_acl = NULL, *acl = NULL; subvol_inum inum; @@ -293,7 +336,6 @@ err_before_quota: mutex_unlock(&dir->ei_update_lock); } - bch2_iget5_set(&inode->v, &inum); bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); @@ -304,36 +346,7 @@ err_before_quota: * bch2_trans_exit() and dropping locks, else we could race with another * thread pulling the inode in and modifying it: */ - - inode->v.i_state |= I_CREATING; - - old = to_bch_ei(inode_insert5(&inode->v, - bch2_inode_hash(inum), - bch2_iget5_test, - bch2_iget5_set, - &inum)); - BUG_ON(!old); - - if (unlikely(old != inode)) { - /* - * We raced, another process pulled the new inode into cache - * before us: - */ - make_bad_inode(&inode->v); - iput(&inode->v); - - inode = old; - } else { - mutex_lock(&c->vfs_inodes_lock); - list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); - mutex_unlock(&c->vfs_inodes_lock); - /* - * we really don't want insert_inode_locked2() to be setting - * I_NEW... - */ - unlock_new_inode(&inode->v); - } - + inode = bch2_inode_insert(c, inode); bch2_trans_put(trans); err: posix_acl_release(default_acl); @@ -1372,6 +1385,7 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *bi, struct bch_subvolume *subvol) { + bch2_iget5_set(&inode->v, &inum); bch2_inode_update_after_write(trans, inode, bi, ~0); if (BCH_SUBVOLUME_SNAP(subvol)) -- cgit v1.2.3 From 737cd174d1666620f2c41a4552623125de6bd80d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 25 Jan 2024 12:36:37 -0500 Subject: bcachefs: bch2_lookup() gives better error message on inode not found When a dirent points to a missing inode, we really should print out the dirent. This requires quite a bit of refactoring, but there's some other benefits: we now do the entire looup (dirent and inode) in a single btree transaction, and copy to the VFS inode with btree locks still held, like the create path. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 64 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index bcc1436c1720..093f5404a655 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -365,23 +365,78 @@ err_trans: /* methods */ +static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, + subvol_inum dir, struct bch_hash_info *dir_hash_info, + const struct qstr *name) +{ + struct bch_fs *c = trans->c; + struct btree_iter dirent_iter = {}; + subvol_inum inum = {}; + + int ret = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc, + dir_hash_info, dir, name, 0); + if (ret) + return ERR_PTR(ret); + + struct bkey_s_c k = bch2_btree_iter_peek_slot(&dirent_iter); + ret = bkey_err(k); + if (ret) + goto err; + + ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum); + if (ret > 0) + ret = -ENOENT; + if (ret) + goto err; + + struct bch_inode_info *inode = + to_bch_ei(ilookup5_nowait(c->vfs_sb, + bch2_inode_hash(inum), + bch2_iget5_test, + &inum)); + if (inode) + goto out; + + struct bch_subvolume subvol; + struct bch_inode_unpacked inode_u; + ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: + bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: + PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans)); + if (bch2_err_matches(ret, ENOENT)) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + bch_err(c, "%s points to missing inode", buf.buf); + printbuf_exit(&buf); + } + if (ret) + goto err; + + bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); + inode = bch2_inode_insert(c, inode); +out: + bch2_trans_iter_exit(trans, &dirent_iter); + return inode; +err: + inode = ERR_PTR(ret); + goto out; +} + static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, unsigned int flags) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); - struct inode *vinode = NULL; - subvol_inum inum = { .subvol = 1 }; - int ret; - - ret = bch2_dirent_lookup(c, inode_inum(dir), &hash, - &dentry->d_name, &inum); - if (!ret) - vinode = bch2_vfs_inode_get(c, inum); + struct bch_inode_info *inode; + bch2_trans_do(c, NULL, NULL, 0, + PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir), + &hash, &dentry->d_name))); + if (IS_ERR(inode)) + inode = NULL; - return d_splice_alias(vinode, dentry); + return d_splice_alias(&inode->v, dentry); } static int bch2_mknod(struct mnt_idmap *idmap, -- cgit v1.2.3 From 0225bdfafd818f895fa4a4512f124a1614e011e2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 1 Feb 2024 06:28:41 -0500 Subject: mempool: kvmalloc pool Add mempool_init_kvmalloc_pool() and mempool_create_kvmalloc_pool(), which wrap kvmalloc() instead of kmalloc() - kmalloc() with a vmalloc() fallback. This is part of a bcachefs cleanup - dropping an internal kvpmalloc() helper (which predates kvmalloc()) along with mempool helpers; this replaces the bcachefs-private kvpmalloc_pool. Signed-off-by: Kent Overstreet Cc: linux-mm@kvack.org --- include/linux/mempool.h | 13 +++++++++++++ mm/mempool.c | 13 +++++++++++++ 2 files changed, 26 insertions(+) diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 7be1e32e6d42..16c5cc807ff6 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -95,6 +95,19 @@ static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size) (void *) size); } +void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data); +void mempool_kvfree(void *element, void *pool_data); + +static inline int mempool_init_kvmalloc_pool(mempool_t *pool, int min_nr, size_t size) +{ + return mempool_init(pool, min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size); +} + +static inline mempool_t *mempool_create_kvmalloc_pool(int min_nr, size_t size) +{ + return mempool_create(min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size); +} + /* * A mempool_alloc_t and mempool_free_t for a simple page allocator that * allocates pages of the order specified by pool_data diff --git a/mm/mempool.c b/mm/mempool.c index dbbf0e9fb424..076c736f5f1f 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -590,6 +590,19 @@ void mempool_kfree(void *element, void *pool_data) } EXPORT_SYMBOL(mempool_kfree); +void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data) +{ + size_t size = (size_t)pool_data; + return kvmalloc(size, gfp_mask); +} +EXPORT_SYMBOL(mempool_kvmalloc); + +void mempool_kvfree(void *element, void *pool_data) +{ + kvfree(element); +} +EXPORT_SYMBOL(mempool_kvfree); + /* * A simple mempool-backed page allocator that allocates pages * of the order specified by pool_data. -- cgit v1.2.3 From cb6fc943b650c4f0ca2ba753531c0803c8afbb5c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 1 Feb 2024 06:35:46 -0500 Subject: bcachefs: kill kvpmalloc() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 8 ++++---- fs/bcachefs/btree_gc.c | 6 ++---- fs/bcachefs/btree_io.c | 4 ++-- fs/bcachefs/btree_journal_iter.c | 4 +--- fs/bcachefs/buckets.c | 29 +++++++++++------------------ fs/bcachefs/compress.c | 14 +++++++------- fs/bcachefs/debug.c | 6 +++--- fs/bcachefs/ec.c | 4 ++-- fs/bcachefs/fifo.h | 4 ++-- fs/bcachefs/journal.c | 4 ++-- fs/bcachefs/journal_io.c | 15 +++++++-------- fs/bcachefs/super.c | 8 ++++---- fs/bcachefs/util.c | 22 ---------------------- fs/bcachefs/util.h | 36 ++---------------------------------- 14 files changed, 49 insertions(+), 115 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index a8b393bc7567..72d24933dc19 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -60,7 +60,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b) clear_btree_node_just_written(b); - kvpfree(b->data, btree_buf_bytes(b)); + kvfree(b->data); b->data = NULL; #ifdef __KERNEL__ kvfree(b->aux_data); @@ -94,7 +94,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) { BUG_ON(b->data || b->aux_data); - b->data = kvpmalloc(btree_buf_bytes(b), gfp); + b->data = kvmalloc(btree_buf_bytes(b), gfp); if (!b->data) return -BCH_ERR_ENOMEM_btree_node_mem_alloc; #ifdef __KERNEL__ @@ -107,7 +107,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) b->aux_data = NULL; #endif if (!b->aux_data) { - kvpfree(b->data, btree_buf_bytes(b)); + kvfree(b->data); b->data = NULL; return -BCH_ERR_ENOMEM_btree_node_mem_alloc; } @@ -408,7 +408,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) if (c->verify_data) list_move(&c->verify_data->list, &bc->live); - kvpfree(c->verify_ondisk, c->opts.btree_node_size); + kvfree(c->verify_ondisk); for (i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 3005b39d3a1a..f4872f1d6fc6 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1193,9 +1193,7 @@ static void bch2_gc_free(struct bch_fs *c) genradix_free(&c->gc_stripes); for_each_member_device(c, ca) { - kvpfree(rcu_dereference_protected(ca->buckets_gc, 1), - sizeof(struct bucket_array) + - ca->mi.nbuckets * sizeof(struct bucket)); + kvfree(rcu_dereference_protected(ca->buckets_gc, 1)); ca->buckets_gc = NULL; free_percpu(ca->usage_gc); @@ -1494,7 +1492,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) { for_each_member_device(c, ca) { - struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + + struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket), GFP_KERNEL|__GFP_ZERO); if (!buckets) { diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index caf3eecfc801..767cdbe7f586 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -103,7 +103,7 @@ static void btree_bounce_free(struct bch_fs *c, size_t size, if (used_mempool) mempool_free(p, &c->btree_bounce_pool); else - vpfree(p, size); + kvfree(p); } static void *btree_bounce_alloc(struct bch_fs *c, size_t size, @@ -115,7 +115,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size, BUG_ON(size > c->opts.btree_node_size); *used_mempool = false; - p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); + p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT); if (!p) { *used_mempool = true; p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index b7ac93c8fdd8..3da65562fdb0 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -447,9 +447,7 @@ void bch2_journal_entries_free(struct bch_fs *c) struct genradix_iter iter; genradix_for_each(&c->journal_entries, iter, i) - if (*i) - kvpfree(*i, offsetof(struct journal_replay, j) + - vstruct_bytes(&(*i)->j)); + kvfree(*i); genradix_free(&c->journal_entries); } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 54f7826ac498..7dca10ba70d2 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1335,7 +1335,7 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu) struct bucket_gens *buckets = container_of(rcu, struct bucket_gens, rcu); - kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets); + kvfree(buckets); } int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) @@ -1345,16 +1345,16 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) bool resize = ca->bucket_gens != NULL; int ret; - if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, - GFP_KERNEL|__GFP_ZERO))) { + if (!(bucket_gens = kvmalloc(sizeof(struct bucket_gens) + nbuckets, + GFP_KERNEL|__GFP_ZERO))) { ret = -BCH_ERR_ENOMEM_bucket_gens; goto err; } if ((c->opts.buckets_nouse && - !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * - sizeof(unsigned long), - GFP_KERNEL|__GFP_ZERO)))) { + !(buckets_nouse = kvmalloc(BITS_TO_LONGS(nbuckets) * + sizeof(unsigned long), + GFP_KERNEL|__GFP_ZERO)))) { ret = -BCH_ERR_ENOMEM_buckets_nouse; goto err; } @@ -1397,8 +1397,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ret = 0; err: - kvpfree(buckets_nouse, - BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); + kvfree(buckets_nouse); if (bucket_gens) call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); @@ -1407,27 +1406,21 @@ err: void bch2_dev_buckets_free(struct bch_dev *ca) { - unsigned i; - - kvpfree(ca->buckets_nouse, - BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); - kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), - sizeof(struct bucket_gens) + ca->mi.nbuckets); + kvfree(ca->buckets_nouse); + kvfree(rcu_dereference_protected(ca->bucket_gens, 1)); - for (i = 0; i < ARRAY_SIZE(ca->usage); i++) + for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) free_percpu(ca->usage[i]); kfree(ca->usage_base); } int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) { - unsigned i; - ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL); if (!ca->usage_base) return -BCH_ERR_ENOMEM_usage_init; - for (i = 0; i < ARRAY_SIZE(ca->usage); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) { ca->usage[i] = alloc_percpu(struct bch_dev_usage); if (!ca->usage[i]) return -BCH_ERR_ENOMEM_usage_init; diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 33df8cf86bd8..1410365a8891 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -601,13 +601,13 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) return 0; if (!mempool_initialized(&c->compression_bounce[READ]) && - mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], - 1, c->opts.encoded_extent_max)) + mempool_init_kvmalloc_pool(&c->compression_bounce[READ], + 1, c->opts.encoded_extent_max)) return -BCH_ERR_ENOMEM_compression_bounce_read_init; if (!mempool_initialized(&c->compression_bounce[WRITE]) && - mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], - 1, c->opts.encoded_extent_max)) + mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE], + 1, c->opts.encoded_extent_max)) return -BCH_ERR_ENOMEM_compression_bounce_write_init; for (i = compression_types; @@ -622,15 +622,15 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) if (mempool_initialized(&c->compress_workspace[i->type])) continue; - if (mempool_init_kvpmalloc_pool( + if (mempool_init_kvmalloc_pool( &c->compress_workspace[i->type], 1, i->compress_workspace)) return -BCH_ERR_ENOMEM_compression_workspace_init; } if (!mempool_initialized(&c->decompress_workspace) && - mempool_init_kvpmalloc_pool(&c->decompress_workspace, - 1, decompress_workspace_size)) + mempool_init_kvmalloc_pool(&c->decompress_workspace, + 1, decompress_workspace_size)) return -BCH_ERR_ENOMEM_decompression_workspace_init; return 0; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 7bdba8507fc9..b1f147e6be4d 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -137,7 +137,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) mutex_lock(&c->verify_lock); if (!c->verify_ondisk) { - c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL); + c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL); if (!c->verify_ondisk) goto out; } @@ -199,7 +199,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, return; } - n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL); + n_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL); if (!n_ondisk) { prt_printf(out, "memory allocation failure\n"); goto out; @@ -293,7 +293,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, out: if (bio) bio_put(bio); - kvpfree(n_ondisk, btree_buf_bytes(b)); + kvfree(n_ondisk); percpu_ref_put(&ca->io_ref); } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index d503af270024..b98e2c2b8bf0 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -504,7 +504,7 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) unsigned i; for (i = 0; i < s->v.nr_blocks; i++) { - kvpfree(buf->data[i], buf->size << 9); + kvfree(buf->data[i]); buf->data[i] = NULL; } } @@ -531,7 +531,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf, memset(buf->valid, 0xFF, sizeof(buf->valid)); for (i = 0; i < v->nr_blocks; i++) { - buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL); + buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL); if (!buf->data[i]) goto err; } diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h index 66b945be10c2..d8153fe27037 100644 --- a/fs/bcachefs/fifo.h +++ b/fs/bcachefs/fifo.h @@ -24,12 +24,12 @@ struct { \ (fifo)->mask = (fifo)->size \ ? roundup_pow_of_two((fifo)->size) - 1 \ : 0; \ - (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \ + (fifo)->data = kvmalloc(fifo_buf_size(fifo), (_gfp)); \ }) #define free_fifo(fifo) \ do { \ - kvpfree((fifo)->data, fifo_buf_size(fifo)); \ + kvfree((fifo)->data); \ (fifo)->data = NULL; \ } while (0) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index fe5f7a944ad3..f5be8e417c8a 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1343,7 +1343,7 @@ void bch2_fs_journal_exit(struct journal *j) darray_exit(&j->early_journal_entries); for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) - kvpfree(j->buf[i].data, j->buf[i].buf_size); + kvfree(j->buf[i].data); free_fifo(&j->pin); } @@ -1372,7 +1372,7 @@ int bch2_fs_journal_init(struct journal *j) for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) { j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; - j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL); + j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL); if (!j->buf[i].data) return -BCH_ERR_ENOMEM_journal_buf; j->buf[i].idx = i; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index cd8921a2c0da..c4f19b996b45 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -84,8 +84,7 @@ static void __journal_replay_free(struct bch_fs *c, BUG_ON(*p != i); *p = NULL; - kvpfree(i, offsetof(struct journal_replay, j) + - vstruct_bytes(&i->j)); + kvfree(i); } static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) @@ -196,7 +195,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, goto out; } replace: - i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); + i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); if (!i) return -BCH_ERR_ENOMEM_journal_entry_add; @@ -965,11 +964,11 @@ static int journal_read_buf_realloc(struct journal_read_buf *b, return -BCH_ERR_ENOMEM_journal_read_buf_realloc; new_size = roundup_pow_of_two(new_size); - n = kvpmalloc(new_size, GFP_KERNEL); + n = kvmalloc(new_size, GFP_KERNEL); if (!n) return -BCH_ERR_ENOMEM_journal_read_buf_realloc; - kvpfree(b->data, b->size); + kvfree(b->data); b->data = n; b->size = new_size; return 0; @@ -1195,7 +1194,7 @@ found: ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; out: bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); - kvpfree(buf.data, buf.size); + kvfree(buf.data); percpu_ref_put(&ca->io_ref); closure_return(cl); return; @@ -1576,7 +1575,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) return; - new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN); + new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN); if (!new_buf) return; @@ -1587,7 +1586,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) swap(buf->buf_size, new_size); spin_unlock(&j->lock); - kvpfree(new_buf, new_size); + kvfree(new_buf); } static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index b91efec11dfe..b17f2e199322 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -576,7 +576,7 @@ static void __bch2_fs_free(struct bch_fs *c) destroy_workqueue(c->btree_update_wq); bch2_free_super(&c->disk_sb); - kvpfree(c, sizeof(*c)); + kvfree(c); module_put(THIS_MODULE); } @@ -715,7 +715,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) unsigned i, iter_size; int ret = 0; - c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); + c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); if (!c) { c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc); goto out; @@ -882,8 +882,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) BIOSET_NEED_BVECS) || !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || !(c->online_reserved = alloc_percpu(u64)) || - mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, - c->opts.btree_node_size) || + mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, + c->opts.btree_node_size) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, sizeof(u64), GFP_KERNEL))) { diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 3a32faa86b5c..bd3687a726f7 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -1007,28 +1007,6 @@ void sort_cmp_size(void *base, size_t num, size_t size, } } -static void mempool_free_vp(void *element, void *pool_data) -{ - size_t size = (size_t) pool_data; - - vpfree(element, size); -} - -static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data) -{ - size_t size = (size_t) pool_data; - - return vpmalloc(size, gfp_mask); -} - -int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size) -{ - return size < PAGE_SIZE - ? mempool_init_kmalloc_pool(pool, min_nr, size) - : mempool_init(pool, min_nr, mempool_alloc_vp, - mempool_free_vp, (void *) size); -} - #if 0 void eytzinger1_test(void) { diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index b414736d59a5..7fed75c44cd5 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -53,38 +53,6 @@ static inline size_t buf_pages(void *p, size_t len) PAGE_SIZE); } -static inline void vpfree(void *p, size_t size) -{ - if (is_vmalloc_addr(p)) - vfree(p); - else - free_pages((unsigned long) p, get_order(size)); -} - -static inline void *vpmalloc(size_t size, gfp_t gfp_mask) -{ - return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, - get_order(size)) ?: - __vmalloc(size, gfp_mask); -} - -static inline void kvpfree(void *p, size_t size) -{ - if (size < PAGE_SIZE) - kfree(p); - else - vpfree(p, size); -} - -static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) -{ - return size < PAGE_SIZE - ? kmalloc(size, gfp_mask) - : vpmalloc(size, gfp_mask); -} - -int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t); - #define HEAP(type) \ struct { \ size_t size, used; \ @@ -97,13 +65,13 @@ struct { \ ({ \ (heap)->used = 0; \ (heap)->size = (_size); \ - (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\ + (heap)->data = kvmalloc((heap)->size * sizeof((heap)->data[0]),\ (gfp)); \ }) #define free_heap(heap) \ do { \ - kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \ + kvfree((heap)->data); \ (heap)->data = NULL; \ } while (0) -- cgit v1.2.3 From e017047fdb3a449f45d73ea4c4e94465090b93a1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 Feb 2024 20:19:49 -0500 Subject: bcachefs: thread_with_stdio: eliminate double buffering The output buffer lock has to be a spinlock so that we can write to it from interrupt context, so we can't use a direct copy_to_user; this switches thread_with_file_read() to use fault_in_writeable() and copy_to_user_nofault(), similar to how thread_with_file_write() works. Signed-off-by: Kent Overstreet --- fs/bcachefs/thread_with_file.c | 58 +++++++++++++----------------------------- fs/bcachefs/thread_with_file.h | 1 - 2 files changed, 18 insertions(+), 41 deletions(-) diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index 9220d7de10db..8c3afb4c3204 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -67,16 +67,15 @@ err: static inline bool thread_with_stdio_has_output(struct thread_with_stdio *thr) { - return thr->stdio.output_buf.pos || - thr->output2.nr || - thr->thr.done; + return thr->stdio.output_buf.pos || thr->thr.done; } -static ssize_t thread_with_stdio_read(struct file *file, char __user *buf, +static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf, size_t len, loff_t *ppos) { struct thread_with_stdio *thr = container_of(file->private_data, struct thread_with_stdio, thr); + struct printbuf *buf = &thr->stdio.output_buf; size_t copied = 0, b; int ret = 0; @@ -89,44 +88,25 @@ static ssize_t thread_with_stdio_read(struct file *file, char __user *buf, if (ret) return ret; - if (thr->thr.done) - return 0; - - while (len) { - ret = darray_make_room(&thr->output2, thr->stdio.output_buf.pos); - if (ret) - break; - - spin_lock_irq(&thr->stdio.output_lock); - b = min_t(size_t, darray_room(thr->output2), thr->stdio.output_buf.pos); - - memcpy(&darray_top(thr->output2), thr->stdio.output_buf.buf, b); - memmove(thr->stdio.output_buf.buf, - thr->stdio.output_buf.buf + b, - thr->stdio.output_buf.pos - b); - - thr->output2.nr += b; - thr->stdio.output_buf.pos -= b; - spin_unlock_irq(&thr->stdio.output_lock); - - b = min(len, thr->output2.nr); - if (!b) - break; - - b -= copy_to_user(buf, thr->output2.data, b); - if (!b) { + while (len && buf->pos) { + if (fault_in_writeable(ubuf, len) == len) { ret = -EFAULT; break; } - copied += b; - buf += b; - len -= b; - - memmove(thr->output2.data, - thr->output2.data + b, - thr->output2.nr - b); - thr->output2.nr -= b; + spin_lock_irq(&thr->stdio.output_lock); + b = min_t(size_t, len, buf->pos); + + if (b && !copy_to_user_nofault(ubuf, buf->buf, b)) { + memmove(buf->buf, + buf->buf + b, + buf->pos - b); + buf->pos -= b; + ubuf += b; + len -= b; + copied += b; + } + spin_unlock_irq(&thr->stdio.output_lock); } return copied ?: ret; @@ -140,7 +120,6 @@ static int thread_with_stdio_release(struct inode *inode, struct file *file) bch2_thread_with_file_exit(&thr->thr); printbuf_exit(&thr->stdio.input_buf); printbuf_exit(&thr->stdio.output_buf); - darray_exit(&thr->output2); thr->exit(thr); return 0; } @@ -245,7 +224,6 @@ int bch2_run_thread_with_stdio(struct thread_with_stdio *thr, spin_lock_init(&thr->stdio.output_lock); init_waitqueue_head(&thr->stdio.output_wait); - darray_init(&thr->output2); thr->exit = exit; return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, fn); diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h index 05879c5048c8..b5098b52db70 100644 --- a/fs/bcachefs/thread_with_file.h +++ b/fs/bcachefs/thread_with_file.h @@ -20,7 +20,6 @@ int bch2_run_thread_with_file(struct thread_with_file *, struct thread_with_stdio { struct thread_with_file thr; struct stdio_redirect stdio; - DARRAY(char) output2; void (*exit)(struct thread_with_stdio *); }; -- cgit v1.2.3 From 60e1baa872a1550ea7c083977c817ca2ede04eaf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 Feb 2024 22:20:40 -0500 Subject: bcachefs: thread_with_stdio: convert to darray - eliminate the dependency on printbufs, so that we can lift thread_with_file for use in xfs - add a nonblocking parameter to stdio_redirect_printf(), and either block if the buffer is full or drop it on the floor - don't buffer infinitely Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 9 +- fs/bcachefs/thread_with_file.c | 229 ++++++++++++++++++++++------------- fs/bcachefs/thread_with_file.h | 7 +- fs/bcachefs/thread_with_file_types.h | 15 ++- 4 files changed, 160 insertions(+), 100 deletions(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index b17f2e199322..eeeb01156444 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -56,6 +56,7 @@ #include "super.h" #include "super-io.h" #include "sysfs.h" +#include "thread_with_file.h" #include "trace.h" #include @@ -95,16 +96,10 @@ void __bch2_print(struct bch_fs *c, const char *fmt, ...) if (likely(!stdio)) { vprintk(fmt, args); } else { - unsigned long flags; - if (fmt[0] == KERN_SOH[0]) fmt += 2; - spin_lock_irqsave(&stdio->output_lock, flags); - prt_vprintf(&stdio->output_buf, fmt, args); - spin_unlock_irqrestore(&stdio->output_lock, flags); - - wake_up(&stdio->output_wait); + bch2_stdio_redirect_vprintf(stdio, true, fmt, args); } va_end(args); } diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index 8c3afb4c3204..ca81d3fec3ee 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -2,7 +2,6 @@ #ifndef NO_BCACHEFS_FS #include "bcachefs.h" -#include "printbuf.h" #include "thread_with_file.h" #include @@ -65,48 +64,74 @@ err: return ret; } -static inline bool thread_with_stdio_has_output(struct thread_with_stdio *thr) +/* stdio_redirect */ + +static bool stdio_redirect_has_input(struct stdio_redirect *stdio) +{ + return stdio->input.buf.nr || stdio->done; +} + +static bool stdio_redirect_has_output(struct stdio_redirect *stdio) { - return thr->stdio.output_buf.pos || thr->thr.done; + return stdio->output.buf.nr || stdio->done; } +#define WRITE_BUFFER 4096 + +static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio) +{ + return stdio->input.buf.nr < WRITE_BUFFER || stdio->done; +} + +static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio) +{ + return stdio->output.buf.nr < WRITE_BUFFER || stdio->done; +} + +static void stdio_buf_init(struct stdio_buf *buf) +{ + spin_lock_init(&buf->lock); + init_waitqueue_head(&buf->wait); + darray_init(&buf->buf); +} + +/* thread_with_stdio */ + static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf, size_t len, loff_t *ppos) { struct thread_with_stdio *thr = container_of(file->private_data, struct thread_with_stdio, thr); - struct printbuf *buf = &thr->stdio.output_buf; + struct stdio_buf *buf = &thr->stdio.output; size_t copied = 0, b; int ret = 0; - if ((file->f_flags & O_NONBLOCK) && - !thread_with_stdio_has_output(thr)) + if (!(file->f_flags & O_NONBLOCK)) { + ret = wait_event_interruptible(buf->wait, stdio_redirect_has_output(&thr->stdio)); + if (ret) + return ret; + } else if (!stdio_redirect_has_output(&thr->stdio)) return -EAGAIN; - ret = wait_event_interruptible(thr->stdio.output_wait, - thread_with_stdio_has_output(thr)); - if (ret) - return ret; - - while (len && buf->pos) { + while (len && buf->buf.nr) { if (fault_in_writeable(ubuf, len) == len) { ret = -EFAULT; break; } - spin_lock_irq(&thr->stdio.output_lock); - b = min_t(size_t, len, buf->pos); + spin_lock_irq(&buf->lock); + b = min_t(size_t, len, buf->buf.nr); - if (b && !copy_to_user_nofault(ubuf, buf->buf, b)) { - memmove(buf->buf, - buf->buf + b, - buf->pos - b); - buf->pos -= b; + if (b && !copy_to_user_nofault(ubuf, buf->buf.data, b)) { ubuf += b; len -= b; copied += b; + buf->buf.nr -= b; + memmove(buf->buf.data, + buf->buf.data + b, + buf->buf.nr); } - spin_unlock_irq(&thr->stdio.output_lock); + spin_unlock_irq(&buf->lock); } return copied ?: ret; @@ -118,25 +143,18 @@ static int thread_with_stdio_release(struct inode *inode, struct file *file) container_of(file->private_data, struct thread_with_stdio, thr); bch2_thread_with_file_exit(&thr->thr); - printbuf_exit(&thr->stdio.input_buf); - printbuf_exit(&thr->stdio.output_buf); + darray_exit(&thr->stdio.input.buf); + darray_exit(&thr->stdio.output.buf); thr->exit(thr); return 0; } -#define WRITE_BUFFER 4096 - -static inline bool thread_with_stdio_has_input_space(struct thread_with_stdio *thr) -{ - return thr->stdio.input_buf.pos < WRITE_BUFFER || thr->thr.done; -} - static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf, size_t len, loff_t *ppos) { struct thread_with_stdio *thr = container_of(file->private_data, struct thread_with_stdio, thr); - struct printbuf *buf = &thr->stdio.input_buf; + struct stdio_buf *buf = &thr->stdio.input; size_t copied = 0; ssize_t ret = 0; @@ -152,29 +170,29 @@ static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubu break; } - spin_lock(&thr->stdio.input_lock); - if (buf->pos < WRITE_BUFFER) - bch2_printbuf_make_room(buf, min(b, WRITE_BUFFER - buf->pos)); - b = min(len, printbuf_remaining_size(buf)); + spin_lock(&buf->lock); + if (buf->buf.nr < WRITE_BUFFER) + darray_make_room_gfp(&buf->buf, min(b, WRITE_BUFFER - buf->buf.nr), __GFP_NOWARN); + b = min(len, darray_room(buf->buf)); - if (b && !copy_from_user_nofault(&buf->buf[buf->pos], ubuf, b)) { - ubuf += b; - len -= b; - copied += b; - buf->pos += b; + if (b && !copy_from_user_nofault(&buf->buf.data[buf->buf.nr], ubuf, b)) { + buf->buf.nr += b; + ubuf += b; + len -= b; + copied += b; } - spin_unlock(&thr->stdio.input_lock); + spin_unlock(&buf->lock); if (b) { - wake_up(&thr->stdio.input_wait); + wake_up(&buf->wait); } else { if ((file->f_flags & O_NONBLOCK)) { ret = -EAGAIN; break; } - ret = wait_event_interruptible(thr->stdio.input_wait, - thread_with_stdio_has_input_space(thr)); + ret = wait_event_interruptible(buf->wait, + stdio_redirect_has_input_space(&thr->stdio)); if (ret) break; } @@ -188,14 +206,14 @@ static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_stru struct thread_with_stdio *thr = container_of(file->private_data, struct thread_with_stdio, thr); - poll_wait(file, &thr->stdio.output_wait, wait); - poll_wait(file, &thr->stdio.input_wait, wait); + poll_wait(file, &thr->stdio.output.wait, wait); + poll_wait(file, &thr->stdio.input.wait, wait); __poll_t mask = 0; - if (thread_with_stdio_has_output(thr)) + if (stdio_redirect_has_output(&thr->stdio)) mask |= EPOLLIN; - if (thread_with_stdio_has_input_space(thr)) + if (stdio_redirect_has_input_space(&thr->stdio)) mask |= EPOLLOUT; if (thr->thr.done) mask |= EPOLLHUP|EPOLLERR; @@ -203,75 +221,112 @@ static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_stru } static const struct file_operations thread_with_stdio_fops = { - .release = thread_with_stdio_release, + .llseek = no_llseek, .read = thread_with_stdio_read, .write = thread_with_stdio_write, .poll = thread_with_stdio_poll, - .llseek = no_llseek, + .release = thread_with_stdio_release, }; int bch2_run_thread_with_stdio(struct thread_with_stdio *thr, void (*exit)(struct thread_with_stdio *), int (*fn)(void *)) { - thr->stdio.input_buf = PRINTBUF; - thr->stdio.input_buf.atomic++; - spin_lock_init(&thr->stdio.input_lock); - init_waitqueue_head(&thr->stdio.input_wait); - - thr->stdio.output_buf = PRINTBUF; - thr->stdio.output_buf.atomic++; - spin_lock_init(&thr->stdio.output_lock); - init_waitqueue_head(&thr->stdio.output_wait); - + stdio_buf_init(&thr->stdio.input); + stdio_buf_init(&thr->stdio.output); thr->exit = exit; return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, fn); } -int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *buf, size_t len) +int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len) { - wait_event(stdio->input_wait, - stdio->input_buf.pos || stdio->done); + struct stdio_buf *buf = &stdio->input; + wait_event(buf->wait, stdio_redirect_has_input(stdio)); if (stdio->done) return -1; - spin_lock(&stdio->input_lock); - int ret = min(len, stdio->input_buf.pos); - stdio->input_buf.pos -= ret; - memcpy(buf, stdio->input_buf.buf, ret); - memmove(stdio->input_buf.buf, - stdio->input_buf.buf + ret, - stdio->input_buf.pos); - spin_unlock(&stdio->input_lock); + spin_lock(&buf->lock); + int ret = min(len, buf->buf.nr); + buf->buf.nr -= ret; + memcpy(ubuf, buf->buf.data, ret); + memmove(buf->buf.data, + buf->buf.data + ret, + buf->buf.nr); + spin_unlock(&buf->lock); - wake_up(&stdio->input_wait); + wake_up(&buf->wait); return ret; } -int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *buf, size_t len) +int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *ubuf, size_t len) { - wait_event(stdio->input_wait, - stdio->input_buf.pos || stdio->done); + struct stdio_buf *buf = &stdio->input; + wait_event(buf->wait, stdio_redirect_has_input(stdio)); if (stdio->done) return -1; - spin_lock(&stdio->input_lock); - int ret = min(len, stdio->input_buf.pos); - char *n = memchr(stdio->input_buf.buf, '\n', ret); - if (n) - ret = min(ret, n + 1 - stdio->input_buf.buf); - stdio->input_buf.pos -= ret; - memcpy(buf, stdio->input_buf.buf, ret); - memmove(stdio->input_buf.buf, - stdio->input_buf.buf + ret, - stdio->input_buf.pos); - spin_unlock(&stdio->input_lock); - - wake_up(&stdio->input_wait); + spin_lock(&buf->lock); + int ret = min(len, buf->buf.nr); + char *n = memchr(buf->buf.data, '\n', ret); + if (!n) + ret = min(ret, n + 1 - buf->buf.data); + buf->buf.nr -= ret; + memcpy(ubuf, buf->buf.data, ret); + memmove(buf->buf.data, + buf->buf.data + ret, + buf->buf.nr); + spin_unlock(&buf->lock); + + wake_up(&buf->wait); return ret; } +__printf(3, 0) +static void bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args) +{ + size_t len; + + do { + va_list args2; + va_copy(args2, args); + + len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2); + } while (len + 1 > darray_room(*out) && !darray_make_room_gfp(out, len + 1, gfp)); + + out->nr += min(len, darray_room(*out)); +} + +void bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking, + const char *fmt, va_list args) +{ + struct stdio_buf *buf = &stdio->output; + unsigned long flags; + + if (!nonblocking) + wait_event(buf->wait, stdio_redirect_has_output_space(stdio)); + else if (!stdio_redirect_has_output_space(stdio)) + return; + if (stdio->done) + return; + + spin_lock_irqsave(&buf->lock, flags); + bch2_darray_vprintf(&buf->buf, nonblocking ? __GFP_NOWARN : GFP_KERNEL, fmt, args); + spin_unlock_irqrestore(&buf->lock, flags); + + wake_up(&buf->wait); +} + +void bch2_stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking, + const char *fmt, ...) +{ + + va_list args; + va_start(args, fmt); + bch2_stdio_redirect_vprintf(stdio, nonblocking, fmt, args); + va_end(args); +} + #endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h index b5098b52db70..4243c7c5ad3f 100644 --- a/fs/bcachefs/thread_with_file.h +++ b/fs/bcachefs/thread_with_file.h @@ -27,8 +27,8 @@ static inline void thread_with_stdio_done(struct thread_with_stdio *thr) { thr->thr.done = true; thr->stdio.done = true; - wake_up(&thr->stdio.input_wait); - wake_up(&thr->stdio.output_wait); + wake_up(&thr->stdio.input.wait); + wake_up(&thr->stdio.output.wait); } int bch2_run_thread_with_stdio(struct thread_with_stdio *, @@ -37,4 +37,7 @@ int bch2_run_thread_with_stdio(struct thread_with_stdio *, int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t); int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t); +__printf(3, 0) void bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list); +__printf(3, 4) void bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...); + #endif /* _BCACHEFS_THREAD_WITH_FILE_H */ diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h index 90b5e645e98c..e0daf4eec341 100644 --- a/fs/bcachefs/thread_with_file_types.h +++ b/fs/bcachefs/thread_with_file_types.h @@ -2,14 +2,21 @@ #ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H #define _BCACHEFS_THREAD_WITH_FILE_TYPES_H +#include "darray.h" + +struct stdio_buf { + spinlock_t lock; + wait_queue_head_t wait; + darray_char buf; +}; + struct stdio_redirect { - spinlock_t output_lock; - wait_queue_head_t output_wait; - struct printbuf output_buf; + struct stdio_buf input; + struct stdio_buf output; spinlock_t input_lock; wait_queue_head_t input_wait; - struct printbuf input_buf; + darray_char input_buf; bool done; }; -- cgit v1.2.3 From a6777ca4ff237d097b3d6186283eb2d2f24071d1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 Feb 2024 22:49:34 -0500 Subject: bcachefs: thread_with_stdio: kill thread_with_stdio_done() Move the cleanup code to a wrapper function, where we can call it after the thread_with_stdio fn exits. Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 14 ++++---------- fs/bcachefs/thread_with_file.c | 20 +++++++++++++++++--- fs/bcachefs/thread_with_file.h | 11 ++--------- 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 226b39c17667..11711f54057e 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -155,17 +155,14 @@ static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) kfree(thr); } -static int bch2_fsck_offline_thread_fn(void *arg) +static void bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) { - struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr); + struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts); thr->thr.thr.ret = PTR_ERR_OR_ZERO(c); if (!thr->thr.thr.ret) bch2_fs_stop(c); - - thread_with_stdio_done(&thr->thr); - return 0; } static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) @@ -763,9 +760,9 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, return ret; } -static int bch2_fsck_online_thread_fn(void *arg) +static void bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) { - struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr); + struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); struct bch_fs *c = thr->c; c->stdio_filter = current; @@ -793,11 +790,8 @@ static int bch2_fsck_online_thread_fn(void *arg) c->stdio_filter = NULL; c->opts.fix_errors = old_fix_errors; - thread_with_stdio_done(&thr->thr); - up(&c->online_fsck_mutex); bch2_ro_ref_put(c); - return 0; } static long bch2_ioctl_fsck_online(struct bch_fs *c, diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index ca81d3fec3ee..eb8ab4c47a94 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -228,15 +228,29 @@ static const struct file_operations thread_with_stdio_fops = { .release = thread_with_stdio_release, }; +static int thread_with_stdio_fn(void *arg) +{ + struct thread_with_stdio *thr = arg; + + thr->fn(thr); + + thr->thr.done = true; + thr->stdio.done = true; + wake_up(&thr->stdio.input.wait); + wake_up(&thr->stdio.output.wait); + return 0; +} + int bch2_run_thread_with_stdio(struct thread_with_stdio *thr, void (*exit)(struct thread_with_stdio *), - int (*fn)(void *)) + void (*fn)(struct thread_with_stdio *)) { stdio_buf_init(&thr->stdio.input); stdio_buf_init(&thr->stdio.output); - thr->exit = exit; + thr->exit = exit; + thr->fn = fn; - return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, fn); + return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn); } int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len) diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h index 4243c7c5ad3f..66212fcae226 100644 --- a/fs/bcachefs/thread_with_file.h +++ b/fs/bcachefs/thread_with_file.h @@ -21,19 +21,12 @@ struct thread_with_stdio { struct thread_with_file thr; struct stdio_redirect stdio; void (*exit)(struct thread_with_stdio *); + void (*fn)(struct thread_with_stdio *); }; -static inline void thread_with_stdio_done(struct thread_with_stdio *thr) -{ - thr->thr.done = true; - thr->stdio.done = true; - wake_up(&thr->stdio.input.wait); - wake_up(&thr->stdio.output.wait); -} - int bch2_run_thread_with_stdio(struct thread_with_stdio *, void (*exit)(struct thread_with_stdio *), - int (*fn)(void *)); + void (*fn)(struct thread_with_stdio *)); int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t); int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t); -- cgit v1.2.3 From f704f108af79c1ccbc1984cf0fd5e9f30102a718 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 Feb 2024 22:56:16 -0500 Subject: bcachefs: thread_with_stdio: fix bch2_stdio_redirect_readline() This fixes a bug where we'd return data without waiting for a newline, if data was present but a newline was not. Signed-off-by: Kent Overstreet --- fs/bcachefs/thread_with_file.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index eb8ab4c47a94..830efb06ef0b 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -277,25 +277,36 @@ int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t le int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *ubuf, size_t len) { struct stdio_buf *buf = &stdio->input; - + size_t copied = 0; + ssize_t ret = 0; +again: wait_event(buf->wait, stdio_redirect_has_input(stdio)); - if (stdio->done) - return -1; + if (stdio->done) { + ret = -1; + goto out; + } spin_lock(&buf->lock); - int ret = min(len, buf->buf.nr); - char *n = memchr(buf->buf.data, '\n', ret); - if (!n) - ret = min(ret, n + 1 - buf->buf.data); - buf->buf.nr -= ret; - memcpy(ubuf, buf->buf.data, ret); + size_t b = min(len, buf->buf.nr); + char *n = memchr(buf->buf.data, '\n', b); + if (n) + b = min_t(size_t, b, n + 1 - buf->buf.data); + buf->buf.nr -= b; + memcpy(ubuf, buf->buf.data, b); memmove(buf->buf.data, - buf->buf.data + ret, + buf->buf.data + b, buf->buf.nr); + ubuf += b; + len -= b; + copied += b; spin_unlock(&buf->lock); wake_up(&buf->wait); - return ret; + + if (!n && len) + goto again; +out: + return copied ?: ret; } __printf(3, 0) -- cgit v1.2.3 From 032b3fd0571a8d7e22c7d8c97bdfc76f17717de2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 Feb 2024 15:43:16 -0500 Subject: bcachefs: Thread with file documentation Signed-off-by: Kent Overstreet --- fs/bcachefs/thread_with_file.c | 15 ++++++++------- fs/bcachefs/thread_with_file.h | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index 830efb06ef0b..dde9679b68b4 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -76,16 +76,16 @@ static bool stdio_redirect_has_output(struct stdio_redirect *stdio) return stdio->output.buf.nr || stdio->done; } -#define WRITE_BUFFER 4096 +#define STDIO_REDIRECT_BUFSIZE 4096 static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio) { - return stdio->input.buf.nr < WRITE_BUFFER || stdio->done; + return stdio->input.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done; } static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio) { - return stdio->output.buf.nr < WRITE_BUFFER || stdio->done; + return stdio->output.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done; } static void stdio_buf_init(struct stdio_buf *buf) @@ -171,11 +171,12 @@ static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubu } spin_lock(&buf->lock); - if (buf->buf.nr < WRITE_BUFFER) - darray_make_room_gfp(&buf->buf, min(b, WRITE_BUFFER - buf->buf.nr), __GFP_NOWARN); + if (buf->buf.nr < STDIO_REDIRECT_BUFSIZE) + darray_make_room_gfp(&buf->buf, + min(b, STDIO_REDIRECT_BUFSIZE - buf->buf.nr), GFP_NOWAIT); b = min(len, darray_room(buf->buf)); - if (b && !copy_from_user_nofault(&buf->buf.data[buf->buf.nr], ubuf, b)) { + if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) { buf->buf.nr += b; ubuf += b; len -= b; @@ -338,7 +339,7 @@ void bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking, return; spin_lock_irqsave(&buf->lock, flags); - bch2_darray_vprintf(&buf->buf, nonblocking ? __GFP_NOWARN : GFP_KERNEL, fmt, args); + bch2_darray_vprintf(&buf->buf, nonblocking ? GFP_NOWAIT : GFP_KERNEL, fmt, args); spin_unlock_irqrestore(&buf->lock, flags); wake_up(&buf->wait); diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h index 66212fcae226..f06f8ff19a79 100644 --- a/fs/bcachefs/thread_with_file.h +++ b/fs/bcachefs/thread_with_file.h @@ -4,6 +4,38 @@ #include "thread_with_file_types.h" +/* + * Thread with file: Run a kthread and connect it to a file descriptor, so that + * it can be interacted with via fd read/write methods and closing the file + * descriptor stops the kthread. + * + * We have two different APIs: + * + * thread_with_file, the low level version. + * You get to define the full file_operations, including your release function, + * which means that you must call bch2_thread_with_file_exit() from your + * .release method + * + * thread_with_stdio, the higher level version + * This implements full piping of input and output, including .poll. + * + * Notes on behaviour: + * - kthread shutdown behaves like writing or reading from a pipe that has been + * closed + * - Input and output buffers are 4096 bytes, although buffers may in some + * situations slightly exceed that limit so as to avoid chopping off a + * message in the middle in nonblocking mode. + * - Input/output buffers are lazily allocated, with GFP_NOWAIT allocations - + * should be fine but might change in future revisions. + * - Output buffer may grow past 4096 bytes to deal with messages that are + * bigger than 4096 bytes + * - Writing may be done blocking or nonblocking; in nonblocking mode, we only + * drop entire messages. + * + * To write, use stdio_redirect_printf() + * To read, use stdio_redirect_read() or stdio_redirect_readline() + */ + struct task_struct; struct thread_with_file { -- cgit v1.2.3 From 8f9320d3a3993e76c8097fb8567ba5f3eb2cb91b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Feb 2024 20:27:06 -0500 Subject: bcachefs: thread_with_stdio: Mark completed in ->release() This fixes stdio_redirect_read() getting stuck, not noticing that the pipe has been closed. Signed-off-by: Kent Overstreet --- fs/bcachefs/thread_with_file.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index dde9679b68b4..83186e824e88 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -97,6 +97,14 @@ static void stdio_buf_init(struct stdio_buf *buf) /* thread_with_stdio */ +static void thread_with_stdio_done(struct thread_with_stdio *thr) +{ + thr->thr.done = true; + thr->stdio.done = true; + wake_up(&thr->stdio.input.wait); + wake_up(&thr->stdio.output.wait); +} + static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf, size_t len, loff_t *ppos) { @@ -142,6 +150,7 @@ static int thread_with_stdio_release(struct inode *inode, struct file *file) struct thread_with_stdio *thr = container_of(file->private_data, struct thread_with_stdio, thr); + thread_with_stdio_done(thr); bch2_thread_with_file_exit(&thr->thr); darray_exit(&thr->stdio.input.buf); darray_exit(&thr->stdio.output.buf); @@ -235,10 +244,7 @@ static int thread_with_stdio_fn(void *arg) thr->fn(thr); - thr->thr.done = true; - thr->stdio.done = true; - wake_up(&thr->stdio.input.wait); - wake_up(&thr->stdio.output.wait); + thread_with_stdio_done(thr); return 0; } -- cgit v1.2.3 From 5c3273ec3c6af356b29ff50a654f0dc33bf25a83 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 Feb 2024 01:04:38 -0500 Subject: kernel/hung_task.c: export sysctl_hung_task_timeout_secs needed for thread_with_file; also rare but not unheard of to need this in module code, when blocking on user input. one workaround used by some code is wait_event_interruptible() - but that can be buggy if the outer context isn't expecting unwinding. Signed-off-by: Kent Overstreet Cc: Andrew Morton Cc: fuyuanli --- kernel/hung_task.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 9a24574988d2..b2fc2727d654 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -43,6 +43,7 @@ static int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; * Zero means infinite timeout - no checking done: */ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; +EXPORT_SYMBOL_GPL(sysctl_hung_task_timeout_secs); /* * Zero (default value) means use sysctl_hung_task_timeout_secs: -- cgit v1.2.3 From a5a650d6472f238191d98d8aba7536a9fb3d9f99 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Feb 2024 20:41:34 -0500 Subject: bcachefs: thread_with_stdio: suppress hung task warning Signed-off-by: Kent Overstreet --- fs/bcachefs/thread_with_file.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index 83186e824e88..96ac2b2c60e1 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -9,6 +9,7 @@ #include #include #include +#include void bch2_thread_with_file_exit(struct thread_with_file *thr) { @@ -264,7 +265,15 @@ int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t le { struct stdio_buf *buf = &stdio->input; - wait_event(buf->wait, stdio_redirect_has_input(stdio)); + /* + * we're waiting on user input (or for the file descriptor to be + * closed), don't want a hung task warning: + */ + do { + wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio), + sysctl_hung_task_timeout_secs * HZ / 2); + } while (!stdio_redirect_has_input(stdio)); + if (stdio->done) return -1; @@ -287,7 +296,11 @@ int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *ubuf, size_ size_t copied = 0; ssize_t ret = 0; again: - wait_event(buf->wait, stdio_redirect_has_input(stdio)); + do { + wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio), + sysctl_hung_task_timeout_secs * HZ / 2); + } while (!stdio_redirect_has_input(stdio)); + if (stdio->done) { ret = -1; goto out; -- cgit v1.2.3 From fcb1620edd4d59eff4b3466be1d61263cea958a8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 7 Feb 2024 11:43:32 -0800 Subject: bcachefs: thread_with_file: allow creation of readonly files Create a new run_thread_with_stdout function that opens a file in O_RDONLY mode so that the kernel can write things to userspace but userspace cannot write to the kernel. This will be used to convey xfs health event information to userspace. Signed-off-by: Darrick J. Wong Signed-off-by: Kent Overstreet --- fs/bcachefs/thread_with_file.c | 36 ++++++++++++++++++++++++++++++++++++ fs/bcachefs/thread_with_file.h | 3 +++ 2 files changed, 39 insertions(+) diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index 96ac2b2c60e1..d1cd5f4ab06e 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -231,6 +231,22 @@ static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_stru return mask; } +static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_struct *wait) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + poll_wait(file, &thr->stdio.output.wait, wait); + + __poll_t mask = 0; + + if (stdio_redirect_has_output(&thr->stdio)) + mask |= EPOLLIN; + if (thr->thr.done) + mask |= EPOLLHUP|EPOLLERR; + return mask; +} + static const struct file_operations thread_with_stdio_fops = { .llseek = no_llseek, .read = thread_with_stdio_read, @@ -239,6 +255,13 @@ static const struct file_operations thread_with_stdio_fops = { .release = thread_with_stdio_release, }; +static const struct file_operations thread_with_stdout_fops = { + .llseek = no_llseek, + .read = thread_with_stdio_read, + .poll = thread_with_stdout_poll, + .release = thread_with_stdio_release, +}; + static int thread_with_stdio_fn(void *arg) { struct thread_with_stdio *thr = arg; @@ -261,6 +284,19 @@ int bch2_run_thread_with_stdio(struct thread_with_stdio *thr, return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn); } +int bch2_run_thread_with_stdout(struct thread_with_stdio *thr, + void (*exit)(struct thread_with_stdio *), + void (*fn)(struct thread_with_stdio *)) +{ + stdio_buf_init(&thr->stdio.input); + stdio_buf_init(&thr->stdio.output); + thr->exit = exit; + thr->fn = fn; + + return bch2_run_thread_with_file(&thr->thr, &thread_with_stdout_fops, thread_with_stdio_fn); +} +EXPORT_SYMBOL_GPL(bch2_run_thread_with_stdout); + int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len) { struct stdio_buf *buf = &stdio->input; diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h index f06f8ff19a79..2b687723d6b9 100644 --- a/fs/bcachefs/thread_with_file.h +++ b/fs/bcachefs/thread_with_file.h @@ -59,6 +59,9 @@ struct thread_with_stdio { int bch2_run_thread_with_stdio(struct thread_with_stdio *, void (*exit)(struct thread_with_stdio *), void (*fn)(struct thread_with_stdio *)); +int bch2_run_thread_with_stdout(struct thread_with_stdio *, + void (*exit)(struct thread_with_stdio *), + void (*fn)(struct thread_with_stdio *)); int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t); int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t); -- cgit v1.2.3 From 1cbae651e5c875f4d128211b3c732ee545f45424 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 7 Feb 2024 11:39:03 -0800 Subject: bcachefs: thread_with_file: fix various printf problems Experimentally fix some problems with stdio_redirect_vprintf by creating a MOO variant with which we can experiment. We can't do a GFP_KERNEL allocation while holding the spinlock, and I don't like how the printf function can silently truncate the output if memory allocation fails. Signed-off-by: Darrick J. Wong Signed-off-by: Kent Overstreet --- fs/bcachefs/thread_with_file.c | 53 ++++++++++++++++++++++++++++-------------- fs/bcachefs/thread_with_file.h | 4 ++-- 2 files changed, 37 insertions(+), 20 deletions(-) diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index d1cd5f4ab06e..fc4f97c56021 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -366,48 +366,65 @@ out: } __printf(3, 0) -static void bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args) +static ssize_t bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args) { - size_t len; + ssize_t ret; do { va_list args2; - va_copy(args2, args); + size_t len; + va_copy(args2, args); len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2); - } while (len + 1 > darray_room(*out) && !darray_make_room_gfp(out, len + 1, gfp)); + if (len + 1 <= darray_room(*out)) { + out->nr += len; + return len; + } - out->nr += min(len, darray_room(*out)); + ret = darray_make_room_gfp(out, len + 1, gfp); + } while (ret == 0); + + return ret; } -void bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking, - const char *fmt, va_list args) +ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking, + const char *fmt, va_list args) { struct stdio_buf *buf = &stdio->output; unsigned long flags; + ssize_t ret; - if (!nonblocking) - wait_event(buf->wait, stdio_redirect_has_output_space(stdio)); - else if (!stdio_redirect_has_output_space(stdio)) - return; - if (stdio->done) - return; - +again: spin_lock_irqsave(&buf->lock, flags); - bch2_darray_vprintf(&buf->buf, nonblocking ? GFP_NOWAIT : GFP_KERNEL, fmt, args); + ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args); spin_unlock_irqrestore(&buf->lock, flags); + if (ret < 0) { + if (nonblocking) + return -EAGAIN; + + ret = wait_event_interruptible(buf->wait, + stdio_redirect_has_output_space(stdio)); + if (ret) + return ret; + goto again; + } + wake_up(&buf->wait); + return ret; } -void bch2_stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking, +ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking, const char *fmt, ...) { - va_list args; + ssize_t ret; + va_start(args, fmt); - bch2_stdio_redirect_vprintf(stdio, nonblocking, fmt, args); + ret = bch2_stdio_redirect_vprintf(stdio, nonblocking, fmt, args); va_end(args); + + return ret; } #endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h index 2b687723d6b9..e20f2d17ee59 100644 --- a/fs/bcachefs/thread_with_file.h +++ b/fs/bcachefs/thread_with_file.h @@ -65,7 +65,7 @@ int bch2_run_thread_with_stdout(struct thread_with_stdio *, int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t); int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t); -__printf(3, 0) void bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list); -__printf(3, 4) void bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...); +__printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list); +__printf(3, 4) ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...); #endif /* _BCACHEFS_THREAD_WITH_FILE_H */ -- cgit v1.2.3 From ab6752e24ef1eb4ef2cf35c4aa87eb1c9854e1a1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sat, 10 Feb 2024 11:23:01 -0800 Subject: bcachefs: thread_with_file: create ops structure for thread_with_stdio Create an ops structure so we can add more file-based functionality in the next few patches. Signed-off-by: Darrick J. Wong Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 18 ++++++++++++------ fs/bcachefs/thread_with_file.c | 16 ++++++---------- fs/bcachefs/thread_with_file.h | 16 ++++++++++------ 3 files changed, 28 insertions(+), 22 deletions(-) diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 11711f54057e..b1a460729a4b 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -165,6 +165,11 @@ static void bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) bch2_fs_stop(c); } +static const struct thread_with_stdio_ops bch2_offline_fsck_ops = { + .exit = bch2_fsck_thread_exit, + .fn = bch2_fsck_offline_thread_fn, +}; + static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) { struct bch_ioctl_fsck_offline arg; @@ -217,9 +222,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); - ret = bch2_run_thread_with_stdio(&thr->thr, - bch2_fsck_thread_exit, - bch2_fsck_offline_thread_fn); + ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_offline_fsck_ops); err: if (ret < 0) { if (thr) @@ -794,6 +797,11 @@ static void bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) bch2_ro_ref_put(c); } +static const struct thread_with_stdio_ops bch2_online_fsck_ops = { + .exit = bch2_fsck_thread_exit, + .fn = bch2_fsck_online_thread_fn, +}; + static long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) { @@ -834,9 +842,7 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c, goto err; } - ret = bch2_run_thread_with_stdio(&thr->thr, - bch2_fsck_thread_exit, - bch2_fsck_online_thread_fn); + ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops); err: if (ret < 0) { bch_err_fn(c, ret); diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index fc4f97c56021..d298a8e8d2b9 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -155,7 +155,7 @@ static int thread_with_stdio_release(struct inode *inode, struct file *file) bch2_thread_with_file_exit(&thr->thr); darray_exit(&thr->stdio.input.buf); darray_exit(&thr->stdio.output.buf); - thr->exit(thr); + thr->ops->exit(thr); return 0; } @@ -266,32 +266,28 @@ static int thread_with_stdio_fn(void *arg) { struct thread_with_stdio *thr = arg; - thr->fn(thr); + thr->ops->fn(thr); thread_with_stdio_done(thr); return 0; } int bch2_run_thread_with_stdio(struct thread_with_stdio *thr, - void (*exit)(struct thread_with_stdio *), - void (*fn)(struct thread_with_stdio *)) + const struct thread_with_stdio_ops *ops) { stdio_buf_init(&thr->stdio.input); stdio_buf_init(&thr->stdio.output); - thr->exit = exit; - thr->fn = fn; + thr->ops = ops; return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn); } int bch2_run_thread_with_stdout(struct thread_with_stdio *thr, - void (*exit)(struct thread_with_stdio *), - void (*fn)(struct thread_with_stdio *)) + const struct thread_with_stdio_ops *ops) { stdio_buf_init(&thr->stdio.input); stdio_buf_init(&thr->stdio.output); - thr->exit = exit; - thr->fn = fn; + thr->ops = ops; return bch2_run_thread_with_file(&thr->thr, &thread_with_stdout_fops, thread_with_stdio_fn); } diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h index e20f2d17ee59..5361611edb4c 100644 --- a/fs/bcachefs/thread_with_file.h +++ b/fs/bcachefs/thread_with_file.h @@ -49,19 +49,23 @@ int bch2_run_thread_with_file(struct thread_with_file *, const struct file_operations *, int (*fn)(void *)); +struct thread_with_stdio; + +struct thread_with_stdio_ops { + void (*exit)(struct thread_with_stdio *); + void (*fn)(struct thread_with_stdio *); +}; + struct thread_with_stdio { struct thread_with_file thr; struct stdio_redirect stdio; - void (*exit)(struct thread_with_stdio *); - void (*fn)(struct thread_with_stdio *); + const struct thread_with_stdio_ops *ops; }; int bch2_run_thread_with_stdio(struct thread_with_stdio *, - void (*exit)(struct thread_with_stdio *), - void (*fn)(struct thread_with_stdio *)); + const struct thread_with_stdio_ops *); int bch2_run_thread_with_stdout(struct thread_with_stdio *, - void (*exit)(struct thread_with_stdio *), - void (*fn)(struct thread_with_stdio *)); + const struct thread_with_stdio_ops *); int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t); int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t); -- cgit v1.2.3 From 658a1e42ce00b487909cdf91247e69c333b2c37a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sat, 10 Feb 2024 11:32:20 -0800 Subject: bcachefs: thread_with_file: allow ioctls against these files Make it so that a thread_with_stdio user can handle ioctls against the file descriptor. Signed-off-by: Darrick J. Wong Signed-off-by: Kent Overstreet --- fs/bcachefs/thread_with_file.c | 12 ++++++++++++ fs/bcachefs/thread_with_file.h | 1 + 2 files changed, 13 insertions(+) diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index d298a8e8d2b9..d61ede0bda40 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -247,12 +247,23 @@ static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_str return mask; } +static long thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigned long p) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + if (thr->ops->unlocked_ioctl) + return thr->ops->unlocked_ioctl(thr, cmd, p); + return -ENOTTY; +} + static const struct file_operations thread_with_stdio_fops = { .llseek = no_llseek, .read = thread_with_stdio_read, .write = thread_with_stdio_write, .poll = thread_with_stdio_poll, .release = thread_with_stdio_release, + .unlocked_ioctl = thread_with_stdio_ioctl, }; static const struct file_operations thread_with_stdout_fops = { @@ -260,6 +271,7 @@ static const struct file_operations thread_with_stdout_fops = { .read = thread_with_stdio_read, .poll = thread_with_stdout_poll, .release = thread_with_stdio_release, + .unlocked_ioctl = thread_with_stdio_ioctl, }; static int thread_with_stdio_fn(void *arg) diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h index 5361611edb4c..f0b8c04ed4a4 100644 --- a/fs/bcachefs/thread_with_file.h +++ b/fs/bcachefs/thread_with_file.h @@ -54,6 +54,7 @@ struct thread_with_stdio; struct thread_with_stdio_ops { void (*exit)(struct thread_with_stdio *); void (*fn)(struct thread_with_stdio *); + long (*unlocked_ioctl)(struct thread_with_stdio *, unsigned int, unsigned long); }; struct thread_with_stdio { -- cgit v1.2.3 From 6b33312925a77c24905cef1356c2b63a6149d8a1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Feb 2024 20:26:09 -0500 Subject: bcachefs: thread_with_file: Fix missing va_end() Fixes: https://lore.kernel.org/linux-bcachefs/202402131603.E953E2CF@keescook/T/#u Reported-by: coverity scan Signed-off-by: Kent Overstreet --- fs/bcachefs/thread_with_file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index d61ede0bda40..60d701f33f85 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -384,6 +384,8 @@ static ssize_t bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_copy(args2, args); len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2); + va_end(args2); + if (len + 1 <= darray_room(*out)) { out->nr += len; return len; -- cgit v1.2.3 From da23795e4c3ae0efd701e81b54c5c42d4b6f37f4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Feb 2024 20:49:11 -0500 Subject: bcachefs: thread_with_file: add f_ops.flush Add a flush op, to return the exit code via close(). Also update bcachefs usage to use this to return fsck exit codes. Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 25 ++++++++++++++++++++----- fs/bcachefs/thread_with_file.c | 12 +++++++++++- fs/bcachefs/thread_with_file.h | 2 +- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index b1a460729a4b..b584d78cb39c 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -155,14 +155,28 @@ static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) kfree(thr); } -static void bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) +static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) { struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts); - thr->thr.thr.ret = PTR_ERR_OR_ZERO(c); - if (!thr->thr.thr.ret) - bch2_fs_stop(c); + if (IS_ERR(c)) + return PTR_ERR(c); + + int ret = 0; + if (test_bit(BCH_FS_errors_fixed, &c->flags)) + ret |= 1; + if (test_bit(BCH_FS_error, &c->flags)) + ret |= 4; + + bch2_fs_stop(c); + + if (ret & 1) + bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); + if (ret & 4) + bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); + + return ret; } static const struct thread_with_stdio_ops bch2_offline_fsck_ops = { @@ -763,7 +777,7 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, return ret; } -static void bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) +static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) { struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); struct bch_fs *c = thr->c; @@ -795,6 +809,7 @@ static void bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) up(&c->online_fsck_mutex); bch2_ro_ref_put(c); + return ret; } static const struct thread_with_stdio_ops bch2_online_fsck_ops = { diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index 60d701f33f85..940db15d6a93 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -247,6 +247,14 @@ static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_str return mask; } +static int thread_with_stdio_flush(struct file *file, fl_owner_t id) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + return thr->thr.ret; +} + static long thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigned long p) { struct thread_with_stdio *thr = @@ -262,6 +270,7 @@ static const struct file_operations thread_with_stdio_fops = { .read = thread_with_stdio_read, .write = thread_with_stdio_write, .poll = thread_with_stdio_poll, + .flush = thread_with_stdio_flush, .release = thread_with_stdio_release, .unlocked_ioctl = thread_with_stdio_ioctl, }; @@ -270,6 +279,7 @@ static const struct file_operations thread_with_stdout_fops = { .llseek = no_llseek, .read = thread_with_stdio_read, .poll = thread_with_stdout_poll, + .flush = thread_with_stdio_flush, .release = thread_with_stdio_release, .unlocked_ioctl = thread_with_stdio_ioctl, }; @@ -278,7 +288,7 @@ static int thread_with_stdio_fn(void *arg) { struct thread_with_stdio *thr = arg; - thr->ops->fn(thr); + thr->thr.ret = thr->ops->fn(thr); thread_with_stdio_done(thr); return 0; diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h index f0b8c04ed4a4..af54ea8f5b0f 100644 --- a/fs/bcachefs/thread_with_file.h +++ b/fs/bcachefs/thread_with_file.h @@ -53,7 +53,7 @@ struct thread_with_stdio; struct thread_with_stdio_ops { void (*exit)(struct thread_with_stdio *); - void (*fn)(struct thread_with_stdio *); + int (*fn)(struct thread_with_stdio *); long (*unlocked_ioctl)(struct thread_with_stdio *, unsigned int, unsigned long); }; -- cgit v1.2.3 From 52946d828aac5bc8e20665a8434f87740fad03e3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Feb 2024 17:24:18 -0500 Subject: bcachefs: Kill more -EIO error codes This converts -EIOs related to btree node errors to private error codes, which will help with some ongoing debugging by giving us better error messages. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 3 +-- fs/bcachefs/btree_cache.c | 6 +++--- fs/bcachefs/btree_gc.c | 4 ++-- fs/bcachefs/btree_io.c | 7 +++---- fs/bcachefs/btree_iter.c | 2 +- fs/bcachefs/btree_types.h | 2 +- fs/bcachefs/btree_update_interior.c | 3 +-- fs/bcachefs/errcode.h | 6 +++++- fs/bcachefs/error.c | 10 ++++++++-- fs/bcachefs/error.h | 2 +- fs/bcachefs/recovery.c | 2 +- 11 files changed, 27 insertions(+), 20 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 569b97904da4..f9ccefc74c49 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -131,8 +131,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, printbuf_exit(&buf); if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { - bch2_inconsistent_error(c); - return -EIO; + return bch2_inconsistent_error(c) ? BCH_ERR_erofs_unfixed_errors : 0; } else { return 0; } diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 72d24933dc19..e8665258360e 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -905,7 +905,7 @@ retry: if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-EIO); + return ERR_PTR(-BCH_ERR_btree_node_read_error); } EBUG_ON(b->c.btree_id != path->btree_id); @@ -996,7 +996,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-EIO); + return ERR_PTR(-BCH_ERR_btree_node_read_error); } EBUG_ON(b->c.btree_id != path->btree_id); @@ -1079,7 +1079,7 @@ lock_node: if (unlikely(btree_node_read_error(b))) { six_unlock_read(&b->c.lock); - b = ERR_PTR(-EIO); + b = ERR_PTR(-BCH_ERR_btree_node_read_error); goto out; } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index f4872f1d6fc6..b7085e996c44 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -407,7 +407,7 @@ again: printbuf_reset(&buf); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); - if (mustfix_fsck_err_on(ret == -EIO, c, + if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), c, btree_node_unreadable, "Topology repair: unreadable btree node at btree %s level %u:\n" " %s", @@ -979,7 +979,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b false); ret = PTR_ERR_OR_ZERO(child); - if (ret == -EIO) { + if (bch2_err_matches(ret, EIO)) { bch2_topology_error(c); if (__fsck_err(c, diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 767cdbe7f586..624c8287deb4 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -581,8 +581,7 @@ static int __btree_err(int ret, break; case -BCH_ERR_btree_node_read_err_bad_node: bch2_print_string_as_lines(KERN_ERR, out.buf); - bch2_topology_error(c); - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO; + ret = bch2_topology_error(c); break; case -BCH_ERR_btree_node_read_err_incompatible: bch2_print_string_as_lines(KERN_ERR, out.buf); @@ -1746,7 +1745,7 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, list_move(&b->list, &c->btree_cache.freeable); mutex_unlock(&c->btree_cache.lock); - ret = -EIO; + ret = -BCH_ERR_btree_node_read_error; goto err; } @@ -1850,7 +1849,7 @@ static void btree_node_write_work(struct work_struct *work) bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) { - ret = -BCH_ERR_btree_write_all_failed; + ret = -BCH_ERR_btree_node_write_all_failed; goto err; } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 8f185a98f96f..4cb43c676ee1 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2307,7 +2307,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) btree_iter_path(trans, iter)->level); if (iter->flags & BTREE_ITER_WITH_JOURNAL) - return bkey_s_c_err(-EIO); + return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 4a5a64499eb7..94f996ef5d2b 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -727,7 +727,7 @@ struct btree_root { __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); u8 level; u8 alive; - s8 error; + s16 error; }; enum btree_gc_coalesce_fail_reason { diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 8b0291cfa872..fd0bbce11765 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1894,8 +1894,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, __func__, buf1.buf, buf2.buf); printbuf_exit(&buf1); printbuf_exit(&buf2); - bch2_topology_error(c); - ret = -EIO; + ret = bch2_topology_error(c); goto err; } diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 3fd33b307a77..a82a9d754fda 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -178,6 +178,7 @@ x(EINVAL, opt_parse_error) \ x(EINVAL, remove_with_metadata_missing_unimplemented)\ x(EINVAL, remove_would_lose_data) \ + x(EINVAL, btree_iter_with_journal_not_supported) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ @@ -227,7 +228,10 @@ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ x(EIO, btree_node_read_err) \ x(EIO, sb_not_downgraded) \ - x(EIO, btree_write_all_failed) \ + x(EIO, btree_node_write_all_failed) \ + x(EIO, btree_node_read_error) \ + x(EIO, btree_node_read_validate_error) \ + x(EIO, btree_need_topology_repair) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index d32c8bebe46c..043431206799 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "error.h" +#include "recovery.h" #include "super.h" #include "thread_with_file.h" @@ -25,11 +26,16 @@ bool bch2_inconsistent_error(struct bch_fs *c) } } -void bch2_topology_error(struct bch_fs *c) +int bch2_topology_error(struct bch_fs *c) { set_bit(BCH_FS_topology_error, &c->flags); - if (!test_bit(BCH_FS_fsck_running, &c->flags)) + if (!test_bit(BCH_FS_fsck_running, &c->flags)) { bch2_inconsistent_error(c); + return -BCH_ERR_btree_need_topology_repair; + } else { + return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: + -BCH_ERR_btree_node_read_validate_error; + } } void bch2_fatal_error(struct bch_fs *c) diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index fec17d1353d1..94491190e09e 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -30,7 +30,7 @@ struct work_struct; bool bch2_inconsistent_error(struct bch_fs *); -void bch2_topology_error(struct bch_fs *); +int bch2_topology_error(struct bch_fs *); #define bch2_fs_inconsistent(c, ...) \ ({ \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 39271d2d63d1..83f95d22fa81 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -275,7 +275,7 @@ static int journal_replay_entry_early(struct bch_fs *c, bkey_copy(&r->key, (struct bkey_i *) entry->start); r->error = 0; } else { - r->error = -EIO; + r->error = -BCH_ERR_btree_node_read_error; } r->alive = true; break; -- cgit v1.2.3 From 4c20278eb18ac22691aea6d9ecc8d98acffafd0d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Feb 2024 22:20:12 -0500 Subject: bcachefs: Check subvol <-> inode pointers in check_subvol() Subvolumes and subvolume root inodes point to each other: this verifies the subvolume -> inode -> subvolme path. Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 2 +- fs/bcachefs/inode.h | 2 ++ fs/bcachefs/subvolume.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index ee298a47425f..418d731b47d2 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -324,7 +324,7 @@ int bch2_inode_unpack(struct bkey_s_c k, return bch2_inode_unpack_slowpath(k, unpacked); } -static int bch2_inode_peek_nowarn(struct btree_trans *trans, +int bch2_inode_peek_nowarn(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, subvol_inum inum, unsigned flags) diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index b8da7ff8069d..9a9353c001c2 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -95,6 +95,8 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *); void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); +int bch2_inode_peek_nowarn(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *, subvol_inum, unsigned); int bch2_inode_peek(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *, subvol_inum, unsigned); diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 7c67c28d3ef8..e7ee52c39990 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -42,6 +42,36 @@ static int check_subvol(struct btree_trans *trans, return ret ?: -BCH_ERR_transaction_restart_nested; } + struct bch_inode_unpacked inode; + struct btree_iter inode_iter = {}; + ret = bch2_inode_peek_nowarn(trans, &inode_iter, &inode, + (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) }, + 0); + bch2_trans_iter_exit(trans, &inode_iter); + + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(ret, c, subvol_to_missing_root, + "subvolume %llu points to missing subvolume root %llu:%u", + k.k->p.offset, le64_to_cpu(subvol.v->inode), + le32_to_cpu(subvol.v->snapshot))) { + ret = bch2_subvolume_delete(trans, iter->pos.offset); + bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); + return ret ?: -BCH_ERR_transaction_restart_nested; + } + + if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset, + c, subvol_root_wrong_bi_subvol, + "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu", + inode.bi_inum, inode_iter.k.p.snapshot, + inode.bi_subvol, subvol.k->p.offset)) { + inode.bi_subvol = subvol.k->p.offset; + ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot)); + if (ret) + goto err; + } + if (!BCH_SUBVOLUME_SNAP(subvol.v)) { u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot)); u32 snapshot_tree; @@ -73,6 +103,7 @@ static int check_subvol(struct btree_trans *trans, } } +err: fsck_err: return ret; } -- cgit v1.2.3 From f2b02d099cc781df340ed1f8b058dcb504b73a26 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Feb 2024 22:30:51 -0500 Subject: bcachefs: Check subvol <-> inode pointers in check_inode() Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 24f8657370ce..120be62331c9 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -920,6 +920,31 @@ static int check_inode(struct btree_trans *trans, do_update = true; } + if (u.bi_subvol) { + struct bch_subvolume s; + + ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + if (fsck_err_on(ret, + c, inode_bi_subvol_missing, + "inode %llu:%u bi_subvol points to missing subvolume %u", + u.bi_inum, k.k->p.snapshot, u.bi_subvol) || + fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum || + !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot), + k.k->p.snapshot), + c, inode_bi_subvol_wrong, + "inode %llu:%u points to subvol %u, but subvol points to %llu:%u", + u.bi_inum, k.k->p.snapshot, u.bi_subvol, + le64_to_cpu(s.inode), + le32_to_cpu(s.snapshot))) { + u.bi_subvol = 0; + u.bi_parent_subvol = 0; + do_update = true; + } + } + if (do_update) { ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot); bch_err_msg(c, ret, "in fsck updating inode"); -- cgit v1.2.3 From 0b17618fdc90fd4ed558aa9a4028879afd4f09e2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Feb 2024 23:39:08 -0500 Subject: bcachefs: check_inode_dirent_inode() check that if an inode has a backpointer, the dirent it points to points back to it. We do this in check_dirent_inode_dirent(), but only for inodes that have dirents that point to them - we also have to do the check starting from the inode to catch inodes that don't have dirents that point to them. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 125 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 89 insertions(+), 36 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 120be62331c9..830b06d1038c 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -767,6 +767,43 @@ fsck_err: goto out; } +static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos pos) +{ + return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); +} + +static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode, + u32 *snapshot) +{ + if (inode->bi_subvol) { + u64 inum; + int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum); + if (ret) + return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) }); + } + + return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot)); +} + +static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, + struct bkey_s_c_dirent d) +{ + return inode->bi_dir == d.k->p.inode && + inode->bi_dir_offset == d.k->p.offset; +} + +static bool dirent_points_to_inode(struct bkey_s_c_dirent d, + struct bch_inode_unpacked *inode) +{ + return d.v->d_type == DT_SUBVOL + ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol + : le64_to_cpu(d.v->d_inum) == inode->bi_inum; +} + static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) { struct btree_iter iter; @@ -776,6 +813,49 @@ static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) return ret; } +static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k, + struct bch_inode_unpacked *inode, + u32 inode_snapshot, bool *write_inode) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + + struct btree_iter dirent_iter = {}; + struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot); + int ret = bkey_err(d); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(ret, + c, inode_points_to_missing_dirent, + "inode points to missing dirent\n%s", + (bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) || + fsck_err_on(!ret && !dirent_points_to_inode(d, inode), + c, inode_points_to_wrong_dirent, + "inode points to dirent that does not point back:\n%s", + (bch2_bkey_val_to_text(&buf, c, inode_k), + prt_newline(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { + /* + * We just clear the backpointer fields for now. If we find a + * dirent that points to this inode in check_dirents(), we'll + * update it then; then when we get to check_path() if the + * backpointer is still 0 we'll reattach it. + */ + inode->bi_dir = 0; + inode->bi_dir_offset = 0; + inode->bi_flags &= ~BCH_INODE_backptr_untrusted; + *write_inode = true; + } + + ret = 0; +fsck_err: + bch2_trans_iter_exit(trans, &dirent_iter); + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} + static int check_inode(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -920,6 +1000,12 @@ static int check_inode(struct btree_trans *trans, do_update = true; } + if (u.bi_dir || u.bi_dir_offset) { + ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update); + if (ret) + goto err; + } + if (u.bi_subvol) { struct bch_subvolume s; @@ -977,28 +1063,6 @@ int bch2_check_inodes(struct bch_fs *c) return ret; } -static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos pos) -{ - return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); -} - -static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, - struct bkey_s_c_dirent d) -{ - return inode->bi_dir == d.k->p.inode && - inode->bi_dir_offset == d.k->p.offset; -} - -static bool dirent_points_to_inode(struct bkey_s_c_dirent d, - struct bch_inode_unpacked *inode) -{ - return d.v->d_type == DT_SUBVOL - ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol - : le64_to_cpu(d.v->d_inum) == inode->bi_inum; -} - static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) { struct bch_fs *c = trans->c; @@ -1486,7 +1550,7 @@ fsck_err: return ret ?: trans_was_restarted(trans, restart_count); } -static int check_inode_backpointer(struct btree_trans *trans, +static int check_dirent_inode_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c_dirent d, struct bch_inode_unpacked *target, @@ -1585,7 +1649,7 @@ static int check_dirent_target(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret = 0; - ret = check_inode_backpointer(trans, iter, d, target, target_snapshot); + ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot); if (ret) goto err; @@ -2011,18 +2075,7 @@ static int check_path(struct btree_trans *trans, struct bkey_s_c_dirent d; u32 parent_snapshot = snapshot; - if (inode->bi_subvol) { - u64 inum; - - ret = subvol_lookup(trans, inode->bi_parent_subvol, - &parent_snapshot, &inum); - if (ret) - break; - } - - d = dirent_get_by_pos(trans, &dirent_iter, - SPOS(inode->bi_dir, inode->bi_dir_offset, - parent_snapshot)); + d = inode_get_dirent(trans, &dirent_iter, inode, &parent_snapshot); ret = bkey_err(d.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) break; -- cgit v1.2.3 From 971a1503a2220f7e3e4a29e1778949ce84a4ac03 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Feb 2024 16:02:08 -0500 Subject: bcachefs: better log message in lookup_inode_for_snapshot() Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 830b06d1038c..5b059b4fff8c 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -564,13 +564,12 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, } static struct inode_walker_entry * -lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, - u32 snapshot, bool is_whiteout) +lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k) { - struct inode_walker_entry *i; - - snapshot = bch2_snapshot_equiv(c, snapshot); + bool is_whiteout = k.k->type == KEY_TYPE_whiteout; + u32 snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); + struct inode_walker_entry *i; __darray_for_each(w->inodes, i) if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot)) goto found; @@ -581,20 +580,24 @@ found: if (snapshot != i->snapshot && !is_whiteout) { struct inode_walker_entry new = *i; - size_t pos; - int ret; new.snapshot = snapshot; new.count = 0; - bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u", - w->last_pos.inode, snapshot, i->snapshot); + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, k); + + bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" + "unexpected because we should always update the inode when we update a key in that inode\n" + "%s", + w->last_pos.inode, snapshot, i->snapshot, buf.buf); + printbuf_exit(&buf); while (i > w->inodes.data && i[-1].snapshot > snapshot) --i; - pos = i - w->inodes.data; - ret = darray_insert_item(&w->inodes, pos, new); + size_t pos = i - w->inodes.data; + int ret = darray_insert_item(&w->inodes, pos, new); if (ret) return ERR_PTR(ret); @@ -605,21 +608,21 @@ found: } static struct inode_walker_entry *walk_inode(struct btree_trans *trans, - struct inode_walker *w, struct bpos pos, - bool is_whiteout) + struct inode_walker *w, + struct bkey_s_c k) { - if (w->last_pos.inode != pos.inode) { - int ret = get_inodes_all_snapshots(trans, w, pos.inode); + if (w->last_pos.inode != k.k->p.inode) { + int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode); if (ret) return ERR_PTR(ret); - } else if (bkey_cmp(w->last_pos, pos)) { + } else if (bkey_cmp(w->last_pos, k.k->p)) { darray_for_each(w->inodes, i) i->seen_this_pos = false; } - w->last_pos = pos; + w->last_pos = k.k->p; - return lookup_inode_for_snapshot(trans->c, w, pos.snapshot, is_whiteout); + return lookup_inode_for_snapshot(trans->c, w, k); } static int __get_visible_inodes(struct btree_trans *trans, @@ -1371,7 +1374,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, goto err; } - i = walk_inode(trans, inode, equiv, k.k->type == KEY_TYPE_whiteout); + i = walk_inode(trans, inode, k); ret = PTR_ERR_OR_ZERO(i); if (ret) goto err; @@ -1792,7 +1795,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, BUG_ON(!btree_iter_path(trans, iter)->should_be_locked); - i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout); + i = walk_inode(trans, dir, k); ret = PTR_ERR_OR_ZERO(i); if (ret < 0) goto err; @@ -1919,7 +1922,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, if (ret) return ret; - i = walk_inode(trans, inode, k.k->p, k.k->type == KEY_TYPE_whiteout); + i = walk_inode(trans, inode, k); ret = PTR_ERR_OR_ZERO(i); if (ret) return ret; -- cgit v1.2.3 From 0b498a5a3960e8a9a3411c12fad77ef769ed3c1e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Feb 2024 23:41:46 -0500 Subject: bcachefs: check bi_parent_subvol in check_inode() check for inodes with a nonzero bi_parent_subvol field that aren't actually subvolume roots Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 5b059b4fff8c..cb20a3d3f44a 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1009,6 +1009,16 @@ static int check_inode(struct btree_trans *trans, goto err; } + if (fsck_err_on(u.bi_parent_subvol && + (u.bi_subvol == 0 || + u.bi_subvol == BCACHEFS_ROOT_SUBVOL), + c, inode_bi_parent_subvol_nonzero, + "inode %llu:%u has subvol %u but nonzero parent subvol %u", + u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) { + u.bi_parent_subvol = 0; + do_update = true; + } + if (u.bi_subvol) { struct bch_subvolume s; -- cgit v1.2.3 From e539ebb8674c74ab26d707965bcca2fe3fc17b43 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Feb 2024 23:51:23 -0500 Subject: bcachefs: simplify check_dirent_inode_dirent() Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 114 ++++++++++++++++++++++++++--------------------------- 1 file changed, 56 insertions(+), 58 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index cb20a3d3f44a..fa96f63492d1 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1570,77 +1570,75 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, u32 target_snapshot) { struct bch_fs *c = trans->c; - struct btree_iter bp_iter = { NULL }; struct printbuf buf = PRINTBUF; int ret = 0; + if (inode_points_to_dirent(target, d)) + return 0; + if (!target->bi_dir && !target->bi_dir_offset) { target->bi_dir = d.k->p.inode; target->bi_dir_offset = d.k->p.offset; - - ret = __bch2_fsck_write_inode(trans, target, target_snapshot); - if (ret) - goto err; + return __bch2_fsck_write_inode(trans, target, target_snapshot); } - if (!inode_points_to_dirent(target, d)) { - struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, - SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot)); - ret = bkey_err(bp_dirent); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - bool backpointer_exists = !ret; - ret = 0; + struct btree_iter bp_iter = { NULL }; + struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, + SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot)); + ret = bkey_err(bp_dirent); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; - bch2_bkey_val_to_text(&buf, c, d.s_c); - prt_newline(&buf); - if (backpointer_exists) - bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); + bool backpointer_exists = !ret; + ret = 0; - if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists, - c, inode_dir_multiple_links, - "directory %llu:%u with multiple links\n%s", - target->bi_inum, target_snapshot, buf.buf)) { - ret = __remove_dirent(trans, d.k->p); - goto out; - } + if (fsck_err_on(!backpointer_exists, + c, inode_wrong_backpointer, + "inode %llu:%u has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", + target->bi_inum, target_snapshot, + target->bi_dir, + target->bi_dir_offset, + d.k->p.inode, + d.k->p.offset)) { + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + ret = __bch2_fsck_write_inode(trans, target, target_snapshot); + goto out; + } - /* - * hardlinked file with nlink 0: - * We're just adjusting nlink here so check_nlinks() will pick - * it up, it ignores inodes with nlink 0 - */ - if (fsck_err_on(backpointer_exists && !target->bi_nlink, - c, inode_multiple_links_but_nlink_0, - "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", - target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { - target->bi_nlink++; - target->bi_flags &= ~BCH_INODE_unlinked; - - ret = __bch2_fsck_write_inode(trans, target, target_snapshot); - if (ret) - goto err; - } + bch2_bkey_val_to_text(&buf, c, d.s_c); + prt_newline(&buf); + if (backpointer_exists) + bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); + + if (fsck_err_on(backpointer_exists && + (S_ISDIR(target->bi_mode) || + target->bi_subvol), + c, inode_dir_multiple_links, + "%s %llu:%u with multiple links\n%s", + S_ISDIR(target->bi_mode) ? "directory" : "subvolume", + target->bi_inum, target_snapshot, buf.buf)) { + ret = __remove_dirent(trans, d.k->p); + goto out; + } - if (fsck_err_on(!backpointer_exists, - c, inode_wrong_backpointer, - "inode %llu:%u has wrong backpointer:\n" - "got %llu:%llu\n" - "should be %llu:%llu", - target->bi_inum, target_snapshot, - target->bi_dir, - target->bi_dir_offset, - d.k->p.inode, - d.k->p.offset)) { - target->bi_dir = d.k->p.inode; - target->bi_dir_offset = d.k->p.offset; - - ret = __bch2_fsck_write_inode(trans, target, target_snapshot); - if (ret) - goto err; - } + /* + * hardlinked file with nlink 0: + * We're just adjusting nlink here so check_nlinks() will pick + * it up, it ignores inodes with nlink 0 + */ + if (fsck_err_on(backpointer_exists && !target->bi_nlink, + c, inode_multiple_links_but_nlink_0, + "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", + target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { + target->bi_nlink++; + target->bi_flags &= ~BCH_INODE_unlinked; + ret = __bch2_fsck_write_inode(trans, target, target_snapshot); + if (ret) + goto err; } out: err: -- cgit v1.2.3 From ea27001e14e963547df6ae60edc70332fcd02c37 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Feb 2024 00:06:14 -0500 Subject: bcachefs: delete duplicated checks in check_dirent_to_subvol() these were already checked in check_subvol() Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index fa96f63492d1..9069701028a9 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1713,8 +1713,8 @@ fsck_err: return ret; } -static int check_subvol_dirent(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c_dirent d) +static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c_dirent d) { struct bch_fs *c = trans->c; struct bch_inode_unpacked subvol_root; @@ -1724,7 +1724,7 @@ static int check_subvol_dirent(struct btree_trans *trans, struct btree_iter *ite int ret = 0; ret = subvol_lookup(trans, target_subvol, - &target_snapshot, &target_inum); + &target_snapshot, &target_inum); if (ret && !bch2_err_matches(ret, ENOENT)) return ret; @@ -1738,25 +1738,6 @@ static int check_subvol_dirent(struct btree_trans *trans, struct btree_iter *ite if (ret && !bch2_err_matches(ret, ENOENT)) return ret; - if (fsck_err_on(ret, c, subvol_to_missing_root, - "subvolume %u points to missing subvolume root %llu", - target_subvol, - target_inum)) { - bch_err(c, "repair not implemented yet"); - return -EINVAL; - } - - if (fsck_err_on(subvol_root.bi_subvol != target_subvol, - c, subvol_root_wrong_bi_subvol, - "subvol root %llu has wrong bi_subvol field: got %u, should be %u", - target_inum, - subvol_root.bi_subvol, target_subvol)) { - subvol_root.bi_subvol = target_subvol; - ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot); - if (ret) - return ret; - } - ret = check_dirent_target(trans, iter, d, &subvol_root, target_snapshot); if (ret) @@ -1849,7 +1830,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, d = bkey_s_c_to_dirent(k); if (d.v->d_type == DT_SUBVOL) { - ret = check_subvol_dirent(trans, iter, d); + ret = check_dirent_to_subvol(trans, iter, d); if (ret) goto err; } else { -- cgit v1.2.3 From f4e68c859f4e335f0b07dfcc6703cfe501265deb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Feb 2024 00:23:25 -0500 Subject: bcachefs: check inode->bi_parent_subvol against dirent Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 35 +++++++++++++---------------------- fs/bcachefs/sb-errors_types.h | 2 +- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 9069701028a9..fafce6bffff2 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1012,7 +1012,7 @@ static int check_inode(struct btree_trans *trans, if (fsck_err_on(u.bi_parent_subvol && (u.bi_subvol == 0 || u.bi_subvol == BCACHEFS_ROOT_SUBVOL), - c, inode_bi_parent_subvol_nonzero, + c, inode_bi_parent_nonzero, "inode %llu:%u has subvol %u but nonzero parent subvol %u", u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) { u.bi_parent_subvol = 0; @@ -1685,27 +1685,6 @@ static int check_dirent_target(struct btree_trans *trans, d = dirent_i_to_s_c(n); } - - if (fsck_err_on(d.v->d_type == DT_SUBVOL && - target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol), - c, dirent_d_parent_subvol_wrong, - "dirent has wrong d_parent_subvol field: got %u, should be %u", - le32_to_cpu(d.v->d_parent_subvol), - target->bi_parent_subvol)) { - n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - bkey_reassemble(&n->k_i, d.s_c); - n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); - - ret = bch2_trans_update(trans, iter, &n->k_i, 0); - if (ret) - goto err; - - d = dirent_i_to_s_c(n); - } err: fsck_err: printbuf_exit(&buf); @@ -1718,6 +1697,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * { struct bch_fs *c = trans->c; struct bch_inode_unpacked subvol_root; + u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); u32 target_snapshot; u64 target_inum; @@ -1738,6 +1718,17 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * if (ret && !bch2_err_matches(ret, ENOENT)) return ret; + if (fsck_err_on(parent_subvol != subvol_root.bi_parent_subvol, + c, inode_bi_parent_wrong, + "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u", + target_inum, + subvol_root.bi_parent_subvol, parent_subvol)) { + subvol_root.bi_parent_subvol = parent_subvol; + ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot); + if (ret) + return ret; + } + ret = check_dirent_target(trans, iter, d, &subvol_root, target_snapshot); if (ret) diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 2c998e721d90..5178bf579f7c 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -231,7 +231,7 @@ x(dirent_name_dot_or_dotdot, 223) \ x(dirent_name_has_slash, 224) \ x(dirent_d_type_wrong, 225) \ - x(dirent_d_parent_subvol_wrong, 226) \ + x(inode_bi_parent_wrong, 226) \ x(dirent_in_missing_dir_inode, 227) \ x(dirent_in_non_dir_inode, 228) \ x(dirent_to_missing_inode, 229) \ -- cgit v1.2.3 From c60b7f803c8b7dbea75b3cbfd23d788bab7215a6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Feb 2024 00:45:09 -0500 Subject: bcachefs: check dirent->d_parent_subvol Check that d_parent_subvol makes sense - the dirent's snapshot must be visible in d_parent_subvol (i.e. an ancestor of d_parent_subvol's snapshot) in order to be visible. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 57 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index fafce6bffff2..ff7347cc395f 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1692,6 +1692,31 @@ fsck_err: return ret; } +/* find a subvolume that's a descendent of @snapshot: */ +static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_subvolume) + continue; + + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); + if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) { + bch2_trans_iter_exit(trans, &iter); + *subvolid = k.k->p.offset; + goto found; + } + } + if (!ret) + ret = -ENOENT; +found: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c_dirent d) { @@ -1699,18 +1724,44 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * struct bch_inode_unpacked subvol_root; u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); - u32 target_snapshot; - u64 target_inum; + u32 target_snapshot, parent_snapshot; + u64 target_inum, parent_inum; + struct printbuf buf = PRINTBUF; int ret = 0; + ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol, + "dirent parent_subvol points to missing subvolume\n%s", + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) || + fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot), + c, dirent_not_visible_in_parent_subvol, + "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s", + parent_snapshot, + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { + u32 new_parent_subvol; + ret = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); + if (ret) + goto err; + + struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); + ret = PTR_ERR_OR_ZERO(new_dirent); + if (ret) + goto err; + + new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol); + } + ret = subvol_lookup(trans, target_subvol, &target_snapshot, &target_inum); if (ret && !bch2_err_matches(ret, ENOENT)) return ret; if (fsck_err_on(ret, c, dirent_to_missing_subvol, - "dirent points to missing subvolume %u", - le32_to_cpu(d.v->d_child_subvol))) + "dirent points to missing subvolume\n%s", + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) return __remove_dirent(trans, d.k->p); ret = lookup_inode(trans, target_inum, @@ -1733,7 +1784,9 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * target_snapshot); if (ret) return ret; +err: fsck_err: + printbuf_exit(&buf); return ret; } -- cgit v1.2.3 From 45b4ed525e3c5b5dcdee65ae23a27b9efe595714 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Jan 2024 14:57:58 -0500 Subject: bcachefs: Repair subvol dirents that point to non subvols when repair switches d_type to or from DT_SUBVOL, we need to update the target accordingly Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index ff7347cc395f..0d405b960a7c 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1678,6 +1678,12 @@ static int check_dirent_target(struct btree_trans *trans, bkey_reassemble(&n->k_i, d.s_c); n->v.d_type = inode_d_type(target); + if (n->v.d_type == DT_SUBVOL) { + n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); + n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); + } else { + n->v.d_inum = cpu_to_le64(target->bi_inum); + } ret = bch2_trans_update(trans, iter, &n->k_i, 0); if (ret) -- cgit v1.2.3 From 64304aaf4ef3d14c46d711a35a7b78bb5790350a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 22 Jan 2024 15:12:28 -0500 Subject: bcachefs: bch_subvolume::parent -> creation_parent bit of renaming, prep for adding a fs path parent Signed-off-by: Kent Overstreet --- fs/bcachefs/subvolume.c | 24 ++++++++++++------------ fs/bcachefs/subvolume_format.h | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index e7ee52c39990..a0be103b48fe 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -143,8 +143,8 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, le64_to_cpu(s.v->inode), le32_to_cpu(s.v->snapshot)); - if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, parent)) - prt_printf(out, " parent %u", le32_to_cpu(s.v->parent)); + if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, creation_parent)) + prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent)); } static __always_inline int @@ -228,8 +228,8 @@ static int bch2_subvolume_reparent(struct btree_trans *trans, if (k.k->type != KEY_TYPE_subvolume) return 0; - if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, parent) && - le32_to_cpu(bkey_s_c_to_subvolume(k).v->parent) != old_parent) + if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, creation_parent) && + le32_to_cpu(bkey_s_c_to_subvolume(k).v->creation_parent) != old_parent) return 0; s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume); @@ -237,7 +237,7 @@ static int bch2_subvolume_reparent(struct btree_trans *trans, if (ret) return ret; - s->v.parent = cpu_to_le32(new_parent); + s->v.creation_parent = cpu_to_le32(new_parent); return 0; } @@ -260,7 +260,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_subvolume_reparent(trans, &iter, k, - subvolid_to_delete, le32_to_cpu(s.parent))); + subvolid_to_delete, le32_to_cpu(s.creation_parent))); } /* @@ -447,12 +447,12 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, if (ret) goto err; - new_subvol->v.flags = 0; - new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); - new_subvol->v.inode = cpu_to_le64(inode); - new_subvol->v.parent = cpu_to_le32(src_subvolid); - new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c)); - new_subvol->v.otime.hi = 0; + new_subvol->v.flags = 0; + new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); + new_subvol->v.inode = cpu_to_le64(inode); + new_subvol->v.creation_parent = cpu_to_le32(src_subvolid); + new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c)); + new_subvol->v.otime.hi = 0; SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h index af79134b07d6..b81cf0c6119d 100644 --- a/fs/bcachefs/subvolume_format.h +++ b/fs/bcachefs/subvolume_format.h @@ -19,7 +19,7 @@ struct bch_subvolume { * This is _not_ necessarily the subvolume of the directory containing * this subvolume: */ - __le32 parent; + __le32 creation_parent; __le32 pad; bch_le128 otime; }; -- cgit v1.2.3 From f5d58d0c7212adc02b40dc4b7f6c9ebfde685d1c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Feb 2024 19:52:37 -0500 Subject: bcachefs: Fix path where dirent -> subvol missing and we don't fix Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 0d405b960a7c..27bf07000baa 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1765,10 +1765,14 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * if (ret && !bch2_err_matches(ret, ENOENT)) return ret; - if (fsck_err_on(ret, c, dirent_to_missing_subvol, - "dirent points to missing subvolume\n%s", - (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) - return __remove_dirent(trans, d.k->p); + if (ret) { + if (fsck_err(c, dirent_to_missing_subvol, + "dirent points to missing subvolume\n%s", + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) + return __remove_dirent(trans, d.k->p); + ret = 0; + goto out; + } ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot); @@ -1790,6 +1794,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * target_snapshot); if (ret) return ret; +out: err: fsck_err: printbuf_exit(&buf); -- cgit v1.2.3 From 688a7694097653ca9bd6b7febdf7c88763bbbb92 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Feb 2024 22:52:40 -0500 Subject: bcachefs: Pass inode bkey to check_path() prep work for improving logging/error messages Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 55 +++++++++++++++++++++++++---------------------------- fs/bcachefs/inode.h | 14 ++++++++++++++ 2 files changed, 40 insertions(+), 29 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 27bf07000baa..69c1587536f9 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2105,51 +2105,51 @@ static int path_down(struct bch_fs *c, pathbuf *p, * * XXX: we should also be verifying that inodes are in the right subvolumes */ -static int check_path(struct btree_trans *trans, - pathbuf *p, - struct bch_inode_unpacked *inode, - u32 snapshot) +static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k) { struct bch_fs *c = trans->c; + struct bch_inode_unpacked inode; + u32 snapshot = bch2_snapshot_equiv(c, inode_k.k->p.snapshot); int ret = 0; - snapshot = bch2_snapshot_equiv(c, snapshot); p->nr = 0; - while (!(inode->bi_inum == BCACHEFS_ROOT_INO && - inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) { + BUG_ON(bch2_inode_unpack(inode_k, &inode)); + + while (!(inode.bi_inum == BCACHEFS_ROOT_INO && + inode.bi_subvol == BCACHEFS_ROOT_SUBVOL)) { struct btree_iter dirent_iter; struct bkey_s_c_dirent d; u32 parent_snapshot = snapshot; - d = inode_get_dirent(trans, &dirent_iter, inode, &parent_snapshot); + d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot); ret = bkey_err(d.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) break; - if (!ret && !dirent_points_to_inode(d, inode)) { + if (!ret && !dirent_points_to_inode(d, &inode)) { bch2_trans_iter_exit(trans, &dirent_iter); ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; } if (bch2_err_matches(ret, ENOENT)) { - if (fsck_err(c, inode_unreachable, + if (fsck_err(c, inode_unreachable, "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu", - inode->bi_inum, snapshot, - bch2_d_type_str(inode_d_type(inode)), - inode->bi_nlink, - inode->bi_dir, - inode->bi_dir_offset)) - ret = reattach_inode(trans, inode, snapshot); + inode.bi_inum, snapshot, + bch2_d_type_str(inode_d_type(&inode)), + inode.bi_nlink, + inode.bi_dir, + inode.bi_dir_offset)) + ret = reattach_inode(trans, &inode, snapshot); break; } bch2_trans_iter_exit(trans, &dirent_iter); - if (!S_ISDIR(inode->bi_mode)) + if (!S_ISDIR(inode.bi_mode)) break; - ret = path_down(c, p, inode->bi_inum, snapshot); + ret = path_down(c, p, inode.bi_inum, snapshot); if (ret) { bch_err(c, "memory allocation failure"); return ret; @@ -2157,7 +2157,7 @@ static int check_path(struct btree_trans *trans, snapshot = parent_snapshot; - ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); + ret = lookup_inode(trans, inode.bi_dir, &inode, &snapshot); if (ret) { /* Should have been caught in dirents pass */ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -2165,26 +2165,26 @@ static int check_path(struct btree_trans *trans, break; } - if (path_is_dup(p, inode->bi_inum, snapshot)) { + if (path_is_dup(p, inode.bi_inum, snapshot)) { /* XXX print path */ bch_err(c, "directory structure loop"); darray_for_each(*p, i) pr_err("%llu:%u", i->inum, i->snapshot); - pr_err("%llu:%u", inode->bi_inum, snapshot); + pr_err("%llu:%u", inode.bi_inum, snapshot); if (!fsck_err(c, dir_loop, "directory structure loop")) return 0; - ret = remove_backpointer(trans, inode); + ret = remove_backpointer(trans, &inode); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err_msg(c, ret, "removing dirent"); if (ret) break; - ret = reattach_inode(trans, inode, snapshot); + ret = reattach_inode(trans, &inode, snapshot); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_msg(c, ret, "reattaching inode %llu", inode->bi_inum); + bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); break; } } @@ -2200,7 +2200,6 @@ fsck_err: */ int bch2_check_directory_structure(struct bch_fs *c) { - struct bch_inode_unpacked u; pathbuf path = { 0, }; int ret; @@ -2213,12 +2212,10 @@ int bch2_check_directory_structure(struct bch_fs *c) if (!bkey_is_inode(k.k)) continue; - BUG_ON(bch2_inode_unpack(k, &u)); - - if (u.bi_flags & BCH_INODE_unlinked) + if (bch2_inode_flags(k) & BCH_INODE_unlinked) continue; - check_path(trans, &path, &u, iter.pos.snapshot); + check_path(trans, &path, k); }))); darray_exit(&path); diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 9a9353c001c2..056298050550 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -177,6 +177,20 @@ static inline u8 inode_d_type(struct bch_inode_unpacked *inode) return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode); } +static inline u32 bch2_inode_flags(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_inode: + return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags); + case KEY_TYPE_inode_v2: + return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags); + case KEY_TYPE_inode_v3: + return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags); + default: + return 0; + } +} + /* i_nlink: */ static inline unsigned nlink_bias(umode_t mode) -- cgit v1.2.3 From 3a136177f3a7929ca1ae3712267548ac4a668cf8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Feb 2024 23:08:21 -0500 Subject: bcachefs: check_path() now prints full inode when reattaching Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 69c1587536f9..412a196263ea 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2108,7 +2108,9 @@ static int path_down(struct bch_fs *c, pathbuf *p, static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k) { struct bch_fs *c = trans->c; + struct btree_iter inode_iter = {}; struct bch_inode_unpacked inode; + struct printbuf buf = PRINTBUF; u32 snapshot = bch2_snapshot_equiv(c, inode_k.k->p.snapshot); int ret = 0; @@ -2134,14 +2136,12 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino if (bch2_err_matches(ret, ENOENT)) { if (fsck_err(c, inode_unreachable, - "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu", - inode.bi_inum, snapshot, - bch2_d_type_str(inode_d_type(&inode)), - inode.bi_nlink, - inode.bi_dir, - inode.bi_dir_offset)) + "unreachable inode\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, inode_k), + buf.buf))) ret = reattach_inode(trans, &inode, snapshot); - break; + goto out; } bch2_trans_iter_exit(trans, &dirent_iter); @@ -2157,7 +2157,12 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino snapshot = parent_snapshot; - ret = lookup_inode(trans, inode.bi_dir, &inode, &snapshot); + bch2_trans_iter_exit(trans, &inode_iter); + inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, + SPOS(0, inode.bi_dir, snapshot), 0); + ret = bkey_err(inode_k) ?: + !bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode + : bch2_inode_unpack(inode_k, &inode); if (ret) { /* Should have been caught in dirents pass */ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -2165,6 +2170,8 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino break; } + snapshot = inode_k.k->p.snapshot; + if (path_is_dup(p, inode.bi_inum, snapshot)) { /* XXX print path */ bch_err(c, "directory structure loop"); @@ -2188,7 +2195,10 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino break; } } +out: fsck_err: + bch2_trans_iter_exit(trans, &inode_iter); + printbuf_exit(&buf); bch_err_fn(c, ret); return ret; } -- cgit v1.2.3 From 56e230473d395c953cfab78b876fcf0f62b455b1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 Feb 2024 16:04:50 -0500 Subject: bcachefs: Correctly reattach subvolumes Subvolumes need special handling to reattach - we always reattach them in the root subvolume's lost+found, and they need a slightly different kind of dirent. Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 8 ++++---- fs/bcachefs/dirent.h | 2 +- fs/bcachefs/fsck.c | 28 +++++++++++++++++++++++----- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 0dfcb194cb1e..882b2a217b51 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -201,17 +201,17 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, } int bch2_dirent_create_snapshot(struct btree_trans *trans, - u64 dir, u32 snapshot, + u32 dir_subvol, u64 dir, u32 snapshot, const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, u64 *dir_offset, bch_str_hash_flags_t str_hash_flags) { - subvol_inum zero_inum = { 0 }; + subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir }; struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(trans, zero_inum, type, name, dst_inum); + dirent = dirent_create_key(trans, dir_inum, type, name, dst_inum); ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; @@ -220,7 +220,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, dirent->k.p.snapshot = snapshot; ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, - zero_inum, snapshot, + dir_inum, snapshot, &dirent->k_i, str_hash_flags, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); *dir_offset = dirent->k.p.offset; diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 1cd033fbc58a..bee55cca2aa0 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -35,7 +35,7 @@ static inline unsigned dirent_val_u64s(unsigned len) int bch2_dirent_read_target(struct btree_trans *, subvol_inum, struct bkey_s_c_dirent, subvol_inum *); -int bch2_dirent_create_snapshot(struct btree_trans *, u64, u32, +int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, bch_str_hash_flags_t); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 412a196263ea..572ff61c036d 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -252,7 +252,7 @@ create_lostfound: goto err; ret = bch2_dirent_create_snapshot(trans, - root_inode.bi_inum, snapshot, &root_hash_info, + 0, root_inode.bi_inum, snapshot, &root_hash_info, mode_to_type(lostfound->bi_mode), &lostfound_str, lostfound->bi_inum, @@ -275,9 +275,24 @@ static int reattach_inode(struct btree_trans *trans, char name_buf[20]; struct qstr name; u64 dir_offset = 0; + u32 dirent_snapshot = inode_snapshot; int ret; - ret = lookup_lostfound(trans, inode_snapshot, &lostfound); + if (inode->bi_subvol) { + inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL; + + u64 root_inum; + ret = subvol_lookup(trans, inode->bi_parent_subvol, + &dirent_snapshot, &root_inum); + if (ret) + return ret; + + snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol); + } else { + snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); + } + + ret = lookup_lostfound(trans, dirent_snapshot, &lostfound); if (ret) return ret; @@ -291,14 +306,16 @@ static int reattach_inode(struct btree_trans *trans, dir_hash = bch2_hash_info_init(trans->c, &lostfound); - snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); name = (struct qstr) QSTR(name_buf); ret = bch2_dirent_create_snapshot(trans, - lostfound.bi_inum, inode_snapshot, + inode->bi_parent_subvol, lostfound.bi_inum, + dirent_snapshot, &dir_hash, inode_d_type(inode), - &name, inode->bi_inum, &dir_offset, + &name, + inode->bi_subvol ?: inode->bi_inum, + &dir_offset, BCH_HASH_SET_MUST_CREATE); if (ret) return ret; @@ -2135,6 +2152,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino } if (bch2_err_matches(ret, ENOENT)) { + ret = 0; if (fsck_err(c, inode_unreachable, "unreachable inode\n%s", (printbuf_reset(&buf), -- cgit v1.2.3 From 506b187603f18755d69c19559828a6a9ed48093a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Feb 2024 19:10:19 -0500 Subject: bcachefs: bch2_btree_bit_mod -> bch2_btree_bit_mod_buffered Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.c | 4 ++-- fs/bcachefs/btree_update.h | 4 ++-- fs/bcachefs/buckets.c | 3 ++- fs/bcachefs/inode.c | 5 +++-- fs/bcachefs/lru.c | 4 ++-- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index c3ff365acce9..390daa4b15b2 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -785,8 +785,8 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, return ret; } -int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, - struct bpos pos, bool set) +int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, + struct bpos pos, bool set) { struct bkey_i k; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index b9382b7b288b..75ffd82e0fc4 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -62,12 +62,12 @@ int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, int bch2_btree_delete_range(struct bch_fs *, enum btree_id, struct bpos, struct bpos, unsigned, u64 *); -int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); +int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool); static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans, enum btree_id btree, struct bpos pos) { - return bch2_btree_bit_mod(trans, btree, pos, false); + return bch2_btree_bit_mod_buffered(trans, btree, pos, false); } int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 7dca10ba70d2..c2f46b267b3a 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1053,7 +1053,8 @@ int bch2_trigger_extent(struct btree_trans *trans, (int) bch2_bkey_needs_rebalance(c, old); if (mod) { - int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new.k->p, mod > 0); + int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, + new.k->p, mod > 0); if (ret) return ret; } diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 418d731b47d2..a3139bb66f77 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -620,7 +620,8 @@ int bch2_trigger_inode(struct btree_trans *trans, bool old_deleted = bkey_is_deleted_inode(old); bool new_deleted = bkey_is_deleted_inode(new.s_c); if (old_deleted != new_deleted) { - int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new.k->p, new_deleted); + int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, + new.k->p, new_deleted); if (ret) return ret; } @@ -1170,7 +1171,7 @@ fsck_err: bch2_trans_iter_exit(trans, &inode_iter); return ret; delete: - ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false); + ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false); goto out; } diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 7a4ca5a28b3e..ed7577cdb212 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -44,8 +44,8 @@ static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time, bool set) { return time - ? bch2_btree_bit_mod(trans, BTREE_ID_lru, - lru_pos(lru_id, dev_bucket, time), set) + ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, + lru_pos(lru_id, dev_bucket, time), set) : 0; } -- cgit v1.2.3 From e07c28ab92619059833715f3c1c6abc33dd7ec7c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Feb 2024 19:23:56 -0500 Subject: bcachefs: bch2_btree_bit_mod() Provide a non-write buffer version of bch2_btree_bit_mod_buffered(), for the subvolume children btree. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.c | 21 +++++++++++++++++++++ fs/bcachefs/btree_update.h | 1 + 2 files changed, 22 insertions(+) diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 390daa4b15b2..45994fbac645 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -785,6 +785,27 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, return ret; } +int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, + struct bpos pos, bool set) +{ + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); + int ret = PTR_ERR_OR_ZERO(k); + if (ret) + return ret; + + bkey_init(&k->k); + k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; + k->k.p = pos; + + struct btree_iter iter; + bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, 0); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, struct bpos pos, bool set) { diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 75ffd82e0fc4..cc7c53e83f89 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -62,6 +62,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, int bch2_btree_delete_range(struct bch_fs *, enum btree_id, struct bpos, struct bpos, unsigned, u64 *); +int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool); static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans, -- cgit v1.2.3 From b8628a2529e7d615361efdbfd6c9662b687fd828 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Feb 2024 18:39:42 -0500 Subject: bcachefs: bch_subvolume::fs_path_parent Record the filesystem path heirarchy for subvolumes in bch_subvolume Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/fs-common.c | 32 ++++++++++++++++++++++++++++++++ fs/bcachefs/fsck.c | 33 +++++++++++++++++++++++++++------ fs/bcachefs/sb-downgrade.c | 5 ++++- fs/bcachefs/subvolume.c | 23 +++++++++++++++++++++-- fs/bcachefs/subvolume.h | 3 +-- fs/bcachefs/subvolume_format.h | 2 +- 7 files changed, 88 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 14f613617913..772eff5555f7 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -840,7 +840,8 @@ struct bch_sb_field_downgrade { x(snapshot_skiplists, BCH_VERSION(1, 1)) \ x(deleted_inodes, BCH_VERSION(1, 2)) \ x(rebalance_work, BCH_VERSION(1, 3)) \ - x(member_seq, BCH_VERSION(1, 4)) + x(member_seq, BCH_VERSION(1, 4)) \ + x(subvolume_fs_parent, BCH_VERSION(1, 5)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 523507e38887..2aa388110597 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -107,6 +107,7 @@ int bch2_create_trans(struct btree_trans *trans, u32 new_subvol, dir_snapshot; ret = bch2_subvolume_create(trans, new_inode->bi_inum, + dir.subvol, snapshot_src.subvol, &new_subvol, &snapshot, (flags & BCH_CREATE_SNAPSHOT_RO) != 0); @@ -349,6 +350,22 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, return ret; } +static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_parent) +{ + struct btree_iter iter; + struct bkey_i_subvolume *s = + bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_subvolumes, POS(0, subvol), + BTREE_ITER_CACHED, subvolume); + int ret = PTR_ERR_OR_ZERO(s); + if (ret) + return ret; + + s->v.fs_path_parent = cpu_to_le32(new_parent); + bch2_trans_iter_exit(trans, &iter); + return 0; +} + int bch2_rename_trans(struct btree_trans *trans, subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u, subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u, @@ -410,6 +427,21 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } + if (src_inode_u->bi_subvol && + dst_dir.subvol != src_inode_u->bi_parent_subvol) { + ret = subvol_update_parent(trans, src_inode_u->bi_subvol, dst_dir.subvol); + if (ret) + goto err; + } + + if (mode == BCH_RENAME_EXCHANGE && + dst_inode_u->bi_subvol && + src_dir.subvol != dst_inode_u->bi_parent_subvol) { + ret = subvol_update_parent(trans, dst_inode_u->bi_subvol, src_dir.subvol); + if (ret) + goto err; + } + /* Can't move across subvolumes, unless it's a subvolume root: */ if (src_dir.subvol != dst_dir.subvol && (!src_inode_u->bi_subvol || diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 572ff61c036d..797a5711216c 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1744,11 +1744,12 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * struct bkey_s_c_dirent d) { struct bch_fs *c = trans->c; + struct btree_iter subvol_iter = {}; struct bch_inode_unpacked subvol_root; u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); - u32 target_snapshot, parent_snapshot; - u64 target_inum, parent_inum; + u32 parent_snapshot; + u64 parent_inum; struct printbuf buf = PRINTBUF; int ret = 0; @@ -1777,8 +1778,11 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol); } - ret = subvol_lookup(trans, target_subvol, - &target_snapshot, &target_inum); + struct bkey_s_c_subvolume s = + bch2_bkey_get_iter_typed(trans, &subvol_iter, + BTREE_ID_subvolumes, POS(0, target_subvol), + 0, subvolume); + ret = bkey_err(s.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) return ret; @@ -1791,8 +1795,24 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * goto out; } - ret = lookup_inode(trans, target_inum, - &subvol_root, &target_snapshot); + if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol, + c, subvol_fs_path_parent_wrong, + "subvol with wrong fs_path_parent, should be be %u\n%s", + parent_subvol, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { + struct bkey_i_subvolume *n = + bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + n->v.fs_path_parent = cpu_to_le32(parent_subvol); + } + + u64 target_inum = le64_to_cpu(s.v->inode); + u32 target_snapshot = le32_to_cpu(s.v->snapshot); + + ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot); if (ret && !bch2_err_matches(ret, ENOENT)) return ret; @@ -1814,6 +1834,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * out: err: fsck_err: + bch2_trans_iter_exit(trans, &subvol_iter); printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 441dcb1bf160..12ee83e9e251 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -45,7 +45,10 @@ BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list) \ x(rebalance_work, \ - BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) + BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) \ + x(subvolume_fs_parent, \ + BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \ + BCH_FSCK_ERR_subvol_fs_path_parent_wrong) #define DOWNGRADE_TABLE() diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index a0be103b48fe..d365e84367a3 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -20,6 +20,7 @@ static int check_subvol(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bkey_s_c_subvolume subvol; struct bch_snapshot snapshot; + struct printbuf buf = PRINTBUF; unsigned snapid; int ret = 0; @@ -42,6 +43,20 @@ static int check_subvol(struct btree_trans *trans, return ret ?: -BCH_ERR_transaction_restart_nested; } + if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL && + subvol.v->fs_path_parent, + c, subvol_root_fs_path_parent_nonzero, + "root subvolume has nonzero fs_path_parent\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + struct bkey_i_subvolume *n = + bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + n->v.fs_path_parent = 0; + } + struct bch_inode_unpacked inode; struct btree_iter inode_iter = {}; ret = bch2_inode_peek_nowarn(trans, &inode_iter, &inode, @@ -102,9 +117,9 @@ static int check_subvol(struct btree_trans *trans, SET_BCH_SUBVOLUME_SNAP(&s->v, true); } } - err: fsck_err: + printbuf_exit(&buf); return ret; } @@ -143,8 +158,10 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, le64_to_cpu(s.v->inode), le32_to_cpu(s.v->snapshot)); - if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, creation_parent)) + if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, creation_parent)) { prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent)); + prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent)); + } } static __always_inline int @@ -391,6 +408,7 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) } int bch2_subvolume_create(struct btree_trans *trans, u64 inode, + u32 parent_subvolid, u32 src_subvolid, u32 *new_subvolid, u32 *new_snapshotid, @@ -451,6 +469,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); new_subvol->v.inode = cpu_to_le64(inode); new_subvol->v.creation_parent = cpu_to_le32(src_subvolid); + new_subvol->v.fs_path_parent = cpu_to_le32(parent_subvolid); new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c)); new_subvol->v.otime.hi = 0; diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index a6f56f66e27c..a2b0dc1303c2 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -30,8 +30,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *); void bch2_delete_dead_snapshots_async(struct bch_fs *); int bch2_subvolume_unlink(struct btree_trans *, u32); -int bch2_subvolume_create(struct btree_trans *, u64, u32, - u32 *, u32 *, bool); +int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool); int bch2_fs_subvolumes_init(struct bch_fs *); diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h index b81cf0c6119d..e029df7ba89f 100644 --- a/fs/bcachefs/subvolume_format.h +++ b/fs/bcachefs/subvolume_format.h @@ -20,7 +20,7 @@ struct bch_subvolume { * this subvolume: */ __le32 creation_parent; - __le32 pad; + __le32 fs_path_parent; bch_le128 otime; }; -- cgit v1.2.3 From b26d79147f5ffaeabbda1cad556761965d6d85fe Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Jan 2024 06:00:07 -0500 Subject: bcachefs: BTREE_ID_subvolume_children Add a btree to record a parent -> child subvolume relationships, according to the filesystem heirarchy. The subvolume_children btree is a bitset btree: if a bit is set at pos p, that means p.offset is a child of subvolume p.inode. This will be used for efficiently listing subvolumes, as well as recursive deletion. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/bcachefs_format.h | 7 +++- fs/bcachefs/btree_types.h | 1 + fs/bcachefs/recovery_types.h | 1 + fs/bcachefs/sb-downgrade.c | 5 ++- fs/bcachefs/subvolume.c | 98 +++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/subvolume.h | 4 ++ 7 files changed, 114 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 69d0d60d50e3..1391c530d8d2 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -504,6 +504,7 @@ enum gc_phase { GC_PHASE_BTREE_deleted_inodes, GC_PHASE_BTREE_logged_ops, GC_PHASE_BTREE_rebalance_work, + GC_PHASE_BTREE_subvolume_children, GC_PHASE_PENDING_DELETE, }; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 772eff5555f7..1bb24aa73528 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -841,7 +841,8 @@ struct bch_sb_field_downgrade { x(deleted_inodes, BCH_VERSION(1, 2)) \ x(rebalance_work, BCH_VERSION(1, 3)) \ x(member_seq, BCH_VERSION(1, 4)) \ - x(subvolume_fs_parent, BCH_VERSION(1, 5)) + x(subvolume_fs_parent, BCH_VERSION(1, 5)) \ + x(btree_subvolume_children, BCH_VERSION(1, 6)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -1489,7 +1490,9 @@ enum btree_id_flags { BIT_ULL(KEY_TYPE_logged_op_truncate)| \ BIT_ULL(KEY_TYPE_logged_op_finsert)) \ x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \ - BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) + BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \ + x(subvolume_children, 19, 0, \ + BIT_ULL(KEY_TYPE_set)) enum btree_id { #define x(name, nr, ...) BTREE_ID_##name = nr, diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 94f996ef5d2b..d4f4e52ee6ed 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -654,6 +654,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type); BIT_ULL(BKEY_TYPE_inodes)| \ BIT_ULL(BKEY_TYPE_stripes)| \ BIT_ULL(BKEY_TYPE_reflink)| \ + BIT_ULL(BKEY_TYPE_subvolumes)| \ BIT_ULL(BKEY_TYPE_btree)) #define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \ diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h index fa0c8efd2a1b..f0fc1dbb7239 100644 --- a/fs/bcachefs/recovery_types.h +++ b/fs/bcachefs/recovery_types.h @@ -34,6 +34,7 @@ x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ + x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ x(fs_upgrade_for_subvolumes, 22, 0) \ x(resume_logged_ops, 23, PASS_ALWAYS) \ diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 12ee83e9e251..49bc4951572c 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -48,7 +48,10 @@ BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) \ x(subvolume_fs_parent, \ BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \ - BCH_FSCK_ERR_subvol_fs_path_parent_wrong) + BCH_FSCK_ERR_subvol_fs_path_parent_wrong) \ + x(btree_subvolume_children, \ + BIT_ULL(BCH_RECOVERY_PASS_check_subvols), \ + BCH_FSCK_ERR_subvol_children_not_set) #define DOWNGRADE_TABLE() diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index d365e84367a3..68be3a450ca1 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -13,12 +13,24 @@ static int bch2_subvolume_delete(struct btree_trans *, u32); +static struct bpos subvolume_children_pos(struct bkey_s_c k) +{ + if (k.k->type != KEY_TYPE_subvolume) + return POS_MIN; + + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); + if (!s.v->fs_path_parent) + return POS_MIN; + return POS(le32_to_cpu(s.v->fs_path_parent), s.k->p.offset); +} + static int check_subvol(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { struct bch_fs *c = trans->c; struct bkey_s_c_subvolume subvol; + struct btree_iter subvol_children_iter = {}; struct bch_snapshot snapshot; struct printbuf buf = PRINTBUF; unsigned snapid; @@ -57,6 +69,28 @@ static int check_subvol(struct btree_trans *trans, n->v.fs_path_parent = 0; } + if (subvol.v->fs_path_parent) { + struct bpos pos = subvolume_children_pos(k); + + struct bkey_s_c subvol_children_k = + bch2_bkey_get_iter(trans, &subvol_children_iter, + BTREE_ID_subvolume_children, pos, 0); + ret = bkey_err(subvol_children_k); + if (ret) + goto err; + + if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set, + c, subvol_children_not_set, + "subvolume not set in subvolume_children btree at %llu:%llu\n%s", + pos.inode, pos.offset, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, true); + if (ret) + goto err; + } + } + struct bch_inode_unpacked inode; struct btree_iter inode_iter = {}; ret = bch2_inode_peek_nowarn(trans, &inode_iter, &inode, @@ -119,6 +153,7 @@ static int check_subvol(struct btree_trans *trans, } err: fsck_err: + bch2_trans_iter_exit(trans, &subvol_children_iter); printbuf_exit(&buf); return ret; } @@ -134,6 +169,42 @@ int bch2_check_subvols(struct bch_fs *c) return ret; } +static int check_subvol_child(struct btree_trans *trans, + struct btree_iter *child_iter, + struct bkey_s_c child_k) +{ + struct bch_fs *c = trans->c; + struct bch_subvolume s; + int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, child_k.k->p.offset), + 0, subvolume, &s); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(ret || + le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode, + c, subvol_children_bad, + "incorrect entry in subvolume_children btree %llu:%llu", + child_k.k->p.inode, child_k.k->p.offset)) { + ret = bch2_btree_delete_at(trans, child_iter, 0); + if (ret) + goto err; + } +err: +fsck_err: + return ret; +} + +int bch2_check_subvol_children(struct bch_fs *c) +{ + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_subvol_child(trans, &iter, k))); + bch_err_fn(c, ret); + return 0; +} + /* Subvolumes: */ int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k, @@ -164,6 +235,33 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, } } +static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set) +{ + return !bpos_eq(pos, POS_MIN) + ? bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, set) + : 0; +} + +int bch2_subvolume_trigger(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s new, + unsigned flags) +{ + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + struct bpos children_pos_old = subvolume_children_pos(old); + struct bpos children_pos_new = subvolume_children_pos(new.s_c); + + if (!bpos_eq(children_pos_old, children_pos_new)) { + int ret = subvolume_children_mod(trans, children_pos_old, false) ?: + subvolume_children_mod(trans, children_pos_new, true); + if (ret) + return ret; + } + } + + return 0; +} + static __always_inline int bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, bool inconsistent_if_not_found, diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index a2b0dc1303c2..f0979ab56a47 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -8,14 +8,18 @@ enum bkey_invalid_flags; int bch2_check_subvols(struct bch_fs *); +int bch2_check_subvol_children(struct bch_fs *); int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); #define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ .key_invalid = bch2_subvolume_invalid, \ .val_to_text = bch2_subvolume_to_text, \ + .trigger = bch2_subvolume_trigger, \ .min_val_size = 16, \ }) -- cgit v1.2.3 From 835cd3e147a9a8f8acfe7f3a405c582f04e90a33 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 Feb 2024 21:01:04 -0500 Subject: bcachefs: Check for subvolume children when deleting subvolumes Recursively destroying subvolumes isn't allowed yet. Fixes: https://github.com/koverstreet/bcachefs/issues/634 Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 2 +- fs/bcachefs/errcode.h | 2 ++ fs/bcachefs/fs-common.c | 23 +++++++++++++++-------- fs/bcachefs/subvolume.c | 13 +++++++++++++ fs/bcachefs/subvolume.h | 1 + 5 files changed, 32 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 882b2a217b51..d37bd07afbfe 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -525,7 +525,7 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol) continue; - ret = -ENOTEMPTY; + ret = -BCH_ERR_ENOTEMPTY_dir_not_empty; break; } bch2_trans_iter_exit(trans, &iter); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index a82a9d754fda..fe3fc14d3c9a 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -109,6 +109,8 @@ x(ENOENT, ENOENT_dirent_doesnt_match_inode) \ x(ENOENT, ENOENT_dev_not_found) \ x(ENOENT, ENOENT_dev_idx_not_found) \ + x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ + x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ x(0, open_buckets_empty) \ x(0, freelist_empty) \ x(BCH_ERR_freelist_empty, no_buckets_found) \ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 2aa388110597..624e6f963240 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -243,7 +243,7 @@ int bch2_unlink_trans(struct btree_trans *trans, struct bch_inode_unpacked *dir_u, struct bch_inode_unpacked *inode_u, const struct qstr *name, - bool deleting_snapshot) + bool deleting_subvol) { struct bch_fs *c = trans->c; struct btree_iter dir_iter = { NULL }; @@ -271,18 +271,25 @@ int bch2_unlink_trans(struct btree_trans *trans, if (ret) goto err; - if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) { + if (!deleting_subvol && S_ISDIR(inode_u->bi_mode)) { ret = bch2_empty_dir_trans(trans, inum); if (ret) goto err; } - if (deleting_snapshot && !inode_u->bi_subvol) { + if (deleting_subvol && !inode_u->bi_subvol) { ret = -BCH_ERR_ENOENT_not_subvol; goto err; } - if (deleting_snapshot || inode_u->bi_subvol) { + if (inode_u->bi_subvol) { + /* Recursive subvolume destroy not allowed (yet?) */ + ret = bch2_subvol_has_children(trans, inode_u->bi_subvol); + if (ret) + goto err; + } + + if (deleting_subvol || inode_u->bi_subvol) { ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol); if (ret) goto err; @@ -479,10 +486,10 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } - if (S_ISDIR(dst_inode_u->bi_mode) && - bch2_empty_dir_trans(trans, dst_inum)) { - ret = -ENOTEMPTY; - goto err; + if (S_ISDIR(dst_inode_u->bi_mode)) { + ret = bch2_empty_dir_trans(trans, dst_inum); + if (ret) + goto err; } } diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 68be3a450ca1..ce7aed121942 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -262,6 +262,19 @@ int bch2_subvolume_trigger(struct btree_trans *trans, return 0; } +int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol) +{ + struct btree_iter iter; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0); + struct bkey_s_c k = bch2_btree_iter_peek(&iter); + bch2_trans_iter_exit(trans, &iter); + + return bkey_err(k) ?: k.k && k.k->p.inode == subvol + ? -BCH_ERR_ENOTEMPTY_subvol_not_empty + : 0; +} + static __always_inline int bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, bool inconsistent_if_not_found, diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index f0979ab56a47..903c05162c06 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -23,6 +23,7 @@ int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, .min_val_size = 16, \ }) +int bch2_subvol_has_children(struct btree_trans *, u32); int bch2_subvolume_get(struct btree_trans *, unsigned, bool, int, struct bch_subvolume *); int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); -- cgit v1.2.3 From 91dcad18d38849f1702e0d50f5598bb3614c8ff8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 23 Jan 2024 00:01:07 -0500 Subject: bcachefs: Pin btree cache in ram for random access in fsck Various phases of fsck involve checking references from one btree to another: this means doing a sequential scan of one btree, and then mostly random access into the second. This is particularly painful for checking extents <-> backpointers; we can prefetch btree node access on the sequential scan, but not on the random access portion, and this is particularly painful on spinning rust, where we'd like to keep the pipeline fairly full of btree node reads so that the elevator can reduce seeking. This patch implements prefetching and pinning of the portion of the btree that we'll be doing random access to. We already calculate how much of the random access btree will fit in memory so it's a fairly straightforward change. This will put more pressure on system memory usage, so we introduce a new option, fsck_memory_usage_percent, which is the percentage of total system ram that fsck is allowed to pin. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 137 ++++++++++++++++----------------------------- fs/bcachefs/bbpos_types.h | 2 +- fs/bcachefs/btree_cache.c | 13 +++++ fs/bcachefs/btree_types.h | 6 ++ fs/bcachefs/opts.h | 5 ++ 5 files changed, 72 insertions(+), 91 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index f9ccefc74c49..f2b33fe40bd8 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -554,60 +554,61 @@ static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) }; } -static size_t btree_nodes_fit_in_ram(struct bch_fs *c) +static u64 mem_may_pin_bytes(struct bch_fs *c) { struct sysinfo i; - u64 mem_bytes; - si_meminfo(&i); - mem_bytes = i.totalram * i.mem_unit; - return div_u64(mem_bytes >> 1, c->opts.btree_node_size); + + u64 mem_bytes = i.totalram * i.mem_unit; + return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100); +} + +static size_t btree_nodes_fit_in_ram(struct bch_fs *c) +{ + return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size); } static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, - unsigned btree_leaf_mask, - unsigned btree_interior_mask, + u64 btree_leaf_mask, + u64 btree_interior_mask, struct bbpos start, struct bbpos *end) { - struct btree_iter iter; - struct bkey_s_c k; - size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); - enum btree_id btree; + struct bch_fs *c = trans->c; + s64 mem_may_pin = mem_may_pin_bytes(c); int ret = 0; - for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) { - unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2; + btree_interior_mask |= btree_leaf_mask; + + c->btree_cache.pinned_nodes_leaf_mask = btree_leaf_mask; + c->btree_cache.pinned_nodes_interior_mask = btree_interior_mask; + c->btree_cache.pinned_nodes_start = start; + c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX; + + for (enum btree_id btree = start.btree; + btree < BTREE_ID_NR && !ret; + btree++) { + unsigned depth = ((1U << btree) & btree_leaf_mask) ? 0 : 1; + struct btree_iter iter; + struct btree *b; if (!((1U << btree) & btree_leaf_mask) && !((1U << btree) & btree_interior_mask)) continue; - bch2_trans_node_iter_init(trans, &iter, btree, - btree == start.btree ? start.pos : POS_MIN, - 0, depth, 0); - /* - * for_each_btree_key_contineu() doesn't check the return value - * from bch2_btree_iter_advance(), which is needed when - * iterating over interior nodes where we'll see keys at - * SPOS_MAX: - */ - do { - k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0); - ret = bkey_err(k); - if (!k.k || ret) - break; - - --btree_nodes; - if (!btree_nodes) { - *end = BBPOS(btree, k.k->p); + __for_each_btree_node(trans, iter, btree, + btree == start.btree ? start.pos : POS_MIN, + 0, depth, BTREE_ITER_PREFETCH, b, ret) { + mem_may_pin -= btree_buf_bytes(b); + if (mem_may_pin <= 0) { + c->btree_cache.pinned_nodes_end = *end = + BBPOS(btree, b->key.k.p); bch2_trans_iter_exit(trans, &iter); return 0; } - } while (bch2_btree_iter_advance(&iter)); + } bch2_trans_iter_exit(trans, &iter); } - *end = BBPOS_MAX; return ret; } @@ -665,62 +666,6 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, return 0; } -static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c, - struct bpos bucket) -{ - return bch2_dev_exists2(c, bucket.inode) - ? bucket_pos_to_bp(c, bucket, 0) - : bucket; -} - -static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, - struct bpos start, struct bpos *end) -{ - struct btree_iter alloc_iter; - struct btree_iter bp_iter; - struct bkey_s_c alloc_k, bp_k; - size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); - bool alloc_end = false, bp_end = false; - int ret = 0; - - bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc, - start, 0, 1, 0); - bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp_safe(trans->c, start), 0, 1, 0); - while (1) { - alloc_k = !alloc_end - ? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0) - : bkey_s_c_null; - bp_k = !bp_end - ? __bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0) - : bkey_s_c_null; - - ret = bkey_err(alloc_k) ?: bkey_err(bp_k); - if ((!alloc_k.k && !bp_k.k) || ret) { - *end = SPOS_MAX; - break; - } - - --btree_nodes; - if (!btree_nodes) { - *end = alloc_k.k ? alloc_k.k->p : SPOS_MAX; - break; - } - - if (bpos_lt(alloc_iter.pos, SPOS_MAX) && - bpos_lt(bucket_pos_to_bp_safe(trans->c, alloc_iter.pos), bp_iter.pos)) { - if (!bch2_btree_iter_advance(&alloc_iter)) - alloc_end = true; - } else { - if (!bch2_btree_iter_advance(&bp_iter)) - bp_end = true; - } - } - bch2_trans_iter_exit(trans, &bp_iter); - bch2_trans_iter_exit(trans, &alloc_iter); - return ret; -} - int bch2_check_extents_to_backpointers(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); @@ -731,10 +676,16 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) bkey_init(&s.last_flushed.k->k); while (1) { - ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end); + struct bbpos end; + ret = bch2_get_btree_in_memory_pos(trans, + BIT_ULL(BTREE_ID_backpointers), + BIT_ULL(BTREE_ID_backpointers), + BBPOS(BTREE_ID_backpointers, s.bucket_start), &end); if (ret) break; + s.bucket_end = end.pos; + if ( bpos_eq(s.bucket_start, POS_MIN) && !bpos_eq(s.bucket_end, SPOS_MAX)) bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", @@ -762,6 +713,9 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) bch2_trans_put(trans); bch2_bkey_buf_exit(&s.last_flushed, c); + c->btree_cache.pinned_nodes_leaf_mask = 0; + c->btree_cache.pinned_nodes_interior_mask = 0; + bch_err_fn(c, ret); return ret; } @@ -867,6 +821,9 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) } bch2_trans_put(trans); + c->btree_cache.pinned_nodes_leaf_mask = 0; + c->btree_cache.pinned_nodes_interior_mask = 0; + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/bbpos_types.h b/fs/bcachefs/bbpos_types.h index 5198e94cf3b8..f63893344f80 100644 --- a/fs/bcachefs/bbpos_types.h +++ b/fs/bcachefs/bbpos_types.h @@ -13,6 +13,6 @@ static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos) } #define BBPOS_MIN BBPOS(0, POS_MIN) -#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX) +#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, SPOS_MAX) #endif /* _BCACHEFS_BBPOS_TYPES_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index e8665258360e..562561a9a510 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bbpos.h" #include "bkey_buf.h" #include "btree_cache.h" #include "btree_io.h" @@ -208,6 +209,18 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) int ret = 0; lockdep_assert_held(&bc->lock); + + struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p); + + u64 mask = b->c.level + ? bc->pinned_nodes_interior_mask + : bc->pinned_nodes_leaf_mask; + + if ((mask & BIT_ULL(b->c.btree_id)) && + bbpos_cmp(bc->pinned_nodes_start, pos) < 0 && + bbpos_cmp(bc->pinned_nodes_end, pos) >= 0) + return -BCH_ERR_ENOMEM_btree_node_reclaim; + wait_on_io: if (b->flags & ((1U << BTREE_NODE_dirty)| (1U << BTREE_NODE_read_in_flight)| diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index d4f4e52ee6ed..9404d96c38f3 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -5,6 +5,7 @@ #include #include +#include "bbpos_types.h" #include "btree_key_cache_types.h" #include "buckets_types.h" #include "darray.h" @@ -173,6 +174,11 @@ struct btree_cache { */ struct task_struct *alloc_lock; struct closure_waitlist alloc_wait; + + struct bbpos pinned_nodes_start; + struct bbpos pinned_nodes_end; + u64 pinned_nodes_leaf_mask; + u64 pinned_nodes_interior_mask; }; struct btree_node_iter { diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 9a83aca31b4b..136083c11f3a 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -337,6 +337,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Run fsck on mount") \ + x(fsck_memory_usage_percent, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_UINT(20, 70), \ + BCH2_NO_SB_OPT, 50, \ + NULL, "Maximum percentage of system ram fsck is allowed to pin")\ x(fix_errors, u8, \ OPT_FS|OPT_MOUNT, \ OPT_FN(bch2_opt_fix_errors), \ -- cgit v1.2.3 From 3254c1b0e5af7fcb90f2feb928e5313c8a53149e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 Feb 2024 20:15:03 -0500 Subject: bcachefs: Save key_cache_path in peek_slot() When bch2_btree_iter_peek_slot() clones the iterator to search for the next key, and then discovers that the key from the cloned iterator is the key we want to return - we also want to save the iter->key_cache_path as well, for the update path. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 4cb43c676ee1..826ad6fa74cf 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2505,6 +2505,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) k = bch2_btree_iter_peek_upto(&iter2, end); if (k.k && !bkey_err(k)) { + swap(iter->key_cache_path, iter2.key_cache_path); iter->k = iter2.k; k.k = &iter->k; } -- cgit v1.2.3 From 83bd5985fa5437a21f6cf9974e3d023e98af490b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 Feb 2024 20:16:41 -0500 Subject: bcachefs: Track iter->ip_allocated at bch2_trans_copy_iter() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 826ad6fa74cf..f7eec6d398f8 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2765,6 +2765,9 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) struct btree_trans *trans = src->trans; *dst = *src; +#ifdef TRACK_PATH_ALLOCATED + dst->ip_allocated = _RET_IP_; +#endif if (src->path) __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_INTENT); if (src->update_path) -- cgit v1.2.3 From 5ca8ff157d9712b45d249b97b28e35b0d5eead8e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Feb 2024 15:17:14 -0500 Subject: bcachefs: Use kvzalloc() when dynamically allocating btree paths THis silences a mm/page_alloc.c warning about allocating more than a page with GFP_NOFAIL - and there's no reason for this to not have a vmalloc fallback anyways. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index f7eec6d398f8..51bcdc6c6d1c 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1520,7 +1520,7 @@ static noinline void btree_paths_realloc(struct btree_trans *trans) { unsigned nr = trans->nr_paths * 2; - void *p = kzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) + + void *p = kvzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) + sizeof(struct btree_trans_paths) + nr * sizeof(struct btree_path) + nr * sizeof(btree_path_idx_t) + 8 + @@ -3091,7 +3091,7 @@ void bch2_trans_put(struct btree_trans *trans) trans->paths = NULL; if (paths_allocated != trans->_paths_allocated) - kfree_rcu_mightsleep(paths_allocated); + kvfree_rcu_mightsleep(paths_allocated); if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) mempool_free(trans->mem, &c->btree_trans_mem_pool); -- cgit v1.2.3 From 130d229ff56ccce4588448bdd45867f7497760c8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Feb 2024 15:19:22 -0500 Subject: bcachefs: Improve error messages in device remove path Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index eeeb01156444..6c0d5dcbc7a3 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1601,27 +1601,27 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) __bch2_dev_read_only(c, ca); ret = bch2_dev_data_drop(c, ca->dev_idx, flags); - bch_err_msg(ca, ret, "dropping data"); + bch_err_msg(ca, ret, "bch2_dev_data_drop()"); if (ret) goto err; ret = bch2_dev_remove_alloc(c, ca); - bch_err_msg(ca, ret, "deleting alloc info"); + bch_err_msg(ca, ret, "bch2_dev_remove_alloc()"); if (ret) goto err; ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); - bch_err_msg(ca, ret, "flushing journal"); + bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()"); if (ret) goto err; ret = bch2_journal_flush(&c->journal); - bch_err(ca, "journal error"); + bch_err_msg(ca, ret, "bch2_journal_flush()"); if (ret) goto err; ret = bch2_replicas_gc2(c); - bch_err_msg(ca, ret, "in replicas_gc2()"); + bch_err_msg(ca, ret, "bch2_replicas_gc2()"); if (ret) goto err; -- cgit v1.2.3 From b63570f74733851579f489f092b6c28077f57c4a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Feb 2024 17:15:29 -0500 Subject: bcachefs: bch2_print_opts() Make sure early error messages get redirected, for kernel-fsck-from-userland. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 +++ fs/bcachefs/super-io.c | 13 +++++++------ fs/bcachefs/super.c | 17 +++++++++++++++++ 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 1391c530d8d2..ca80ecf3b8bc 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -265,6 +265,9 @@ do { \ #define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") +__printf(2, 3) +void bch2_print_opts(struct bch_opts *, const char *, ...); + __printf(2, 3) void __bch2_print(struct bch_fs *c, const char *fmt, ...); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 36988add581f..a3a9e85ab03c 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -717,6 +717,7 @@ retry: if (IS_ERR(sb->bdev_handle)) { ret = PTR_ERR(sb->bdev_handle); + prt_printf(&err, "error opening %s: %s", path, bch2_err_str(ret)); goto err; } sb->bdev = sb->bdev_handle->bdev; @@ -743,9 +744,9 @@ retry: prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n", path, err.buf); if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg) - printk(KERN_INFO "%s", err2.buf); + bch2_print_opts(opts, KERN_INFO "%s", err2.buf); else - printk(KERN_ERR "%s", err2.buf); + bch2_print_opts(opts, KERN_ERR "%s", err2.buf); printbuf_exit(&err2); printbuf_reset(&err); @@ -808,16 +809,16 @@ got_super: ret = bch2_sb_validate(sb, &err, READ); if (ret) { - printk(KERN_ERR "bcachefs (%s): error validating superblock: %s\n", - path, err.buf); + bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n", + path, err.buf); goto err_no_print; } out: printbuf_exit(&err); return ret; err: - printk(KERN_ERR "bcachefs (%s): error reading superblock: %s\n", - path, err.buf); + bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error reading superblock: %s\n", + path, err.buf); err_no_print: bch2_free_super(sb); goto out; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 6c0d5dcbc7a3..961b25860c3b 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -87,6 +87,23 @@ const char * const bch2_fs_flag_strs[] = { NULL }; +void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...) +{ + struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio; + + va_list args; + va_start(args, fmt); + if (likely(!stdio)) { + vprintk(fmt, args); + } else { + if (fmt[0] == KERN_SOH[0]) + fmt += 2; + + bch2_stdio_redirect_vprintf(stdio, true, fmt, args); + } + va_end(args); +} + void __bch2_print(struct bch_fs *c, const char *fmt, ...) { struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); -- cgit v1.2.3 From 6e9d0558b1ad285eb371e309b36cdcf2d8f8d138 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 Feb 2024 21:42:10 -0500 Subject: bcachefs: bch2_trigger_alloc() handles state changes better bch2_trigger_alloc() kicks off certain tasks on bucket state changes; e.g. triggering the bucket discard worker and the invalidate worker. We've observed the discard worker running too often - most runs it doesn't do any work, according to the tracepoint - so clearly, we're kicking it off too often. This adds an explicit statechange() macro to make these checks more precise. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index fd3e175d8342..c7be6afe8955 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -860,23 +860,28 @@ int bch2_trigger_alloc(struct btree_trans *trans, *bucket_gen(ca, new.k->p.offset) = new_a->gen; bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false); + percpu_up_read(&c->mark_lock); + +#define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; }) +#define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr) +#define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk) - if (new_a->data_type == BCH_DATA_free && - (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk)) + if (statechange(a->data_type == BCH_DATA_free && + bucket_flushed(a))) closure_wake_up(&c->freelist_wait); - if (new_a->data_type == BCH_DATA_need_discard && - (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk)) + if (statechange(a->data_type == BCH_DATA_need_discard && + bucket_flushed(a)) && + !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset)) bch2_do_discards(c); - if (old_a->data_type != BCH_DATA_cached && - new_a->data_type == BCH_DATA_cached && + if (statechange(a->data_type == BCH_DATA_cached) && + !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) bch2_do_invalidates(c); - if (new_a->data_type == BCH_DATA_need_gc_gens) + if (statechange(a->data_type == BCH_DATA_need_gc_gens)) bch2_do_gc_gens(c); - percpu_up_read(&c->mark_lock); } if ((flags & BTREE_TRIGGER_GC) && -- cgit v1.2.3 From b07ce7262636fb294d786c17ec4e857b62c2970b Mon Sep 17 00:00:00 2001 From: Thomas Bertschinger Date: Thu, 15 Feb 2024 19:44:21 -0700 Subject: bcachefs: omit alignment attribute on big endian struct bkey This is needed for building Rust bindings on big endian architectures like s390x. Currently this is only done in userspace, but it might happen in-kernel in the future. When creating a Rust binding for struct bkey, the "packed" attribute is needed to get a type with the correct member offsets in the big endian case. However, rustc does not allow types to have both a "packed" and "align" attribute. Thus, in order to get a Rust type compatible with the C type, we must omit the "aligned" attribute in C. This does not affect the struct's size or member offsets, only its toplevel alignment, which should be an acceptable impact. The little endian version can have the "align" attribute because the "packed" attr is redundant, and rust-bindgen will omit the "packed" attr when an "align" attr is present and it can do so without changing a type's layout Signed-off-by: Thomas Bertschinger Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 1bb24aa73528..bff8750ac0d7 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -189,7 +189,11 @@ struct bversion { __u32 hi; __u64 lo; #endif -} __packed __aligned(4); +} __packed +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +__aligned(4) +#endif +; struct bkey { /* Size of combined key and value, in u64s */ @@ -222,7 +226,36 @@ struct bkey { __u8 pad[1]; #endif -} __packed __aligned(8); +} __packed +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +/* + * The big-endian version of bkey can't be compiled by rustc with the "aligned" + * attr since it doesn't allow types to have both "packed" and "aligned" attrs. + * So for Rust compatibility, don't include this. It can be included in the LE + * version because the "packed" attr is redundant in that case. + * + * History: (quoting Kent) + * + * Specifically, when i was designing bkey, I wanted the header to be no + * bigger than necessary so that bkey_packed could use the rest. That means that + * decently offten extent keys will fit into only 8 bytes, instead of spilling over + * to 16. + * + * But packed_bkey treats the part after the header - the packed section - + * as a single multi word, variable length integer. And bkey, the unpacked + * version, is just a special case version of a bkey_packed; all the packed + * bkey code will work on keys in any packed format, the in-memory + * representation of an unpacked key also is just one type of packed key... + * + * So that constrains the key part of a bkig endian bkey to start right + * after the header. + * + * If we ever do a bkey_v2 and need to expand the hedaer by another byte for + * some reason - that will clean up this wart. + */ +__aligned(8) +#endif +; struct bkey_packed { __u64 _data[0]; -- cgit v1.2.3 From 663db5a55486017ab6564b7014f0db97432598a0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 Feb 2024 22:50:42 -0500 Subject: bcachefs: bch2_check_subvolume_structure() Now that we've got bch_subvolume.fs_path_parent, it's easy to write subvolume Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 160 +++++++++++++++++++++++++++++++++++-------- fs/bcachefs/fsck.h | 1 + fs/bcachefs/recovery_types.h | 1 + 3 files changed, 135 insertions(+), 27 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 797a5711216c..388f181eb0f2 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -342,6 +342,27 @@ static int remove_backpointer(struct btree_trans *trans, return ret; } +static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s) +{ + struct bch_fs *c = trans->c; + + struct bch_inode_unpacked inode; + int ret = bch2_inode_find_by_inum_trans(trans, + (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, + &inode); + if (ret) + return ret; + + ret = remove_backpointer(trans, &inode); + bch_err_msg(c, ret, "removing dirent"); + if (ret) + return ret; + + ret = reattach_inode(trans, &inode, le32_to_cpu(s.v->snapshot)); + bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); + return ret; +} + struct snapshots_seen_entry { u32 id; u32 equiv; @@ -2108,6 +2129,107 @@ int bch2_check_root(struct bch_fs *c) return ret; } +typedef DARRAY(u32) darray_u32; + +static bool darray_u32_has(darray_u32 *d, u32 v) +{ + darray_for_each(*d, i) + if (*i == v) + return true; + return false; +} + +/* + * We've checked that inode backpointers point to valid dirents; here, it's + * sufficient to check that the subvolume root has a dirent: + */ +static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s) +{ + struct bch_inode_unpacked inode; + int ret = bch2_inode_find_by_inum_trans(trans, + (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, + &inode); + if (ret) + return ret; + + return inode.bi_dir != 0; +} + +static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct btree_iter parent_iter = {}; + darray_u32 subvol_path = {}; + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (k.k->type != KEY_TYPE_subvolume) + return 0; + + while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) { + ret = darray_push(&subvol_path, k.k->p.offset); + if (ret) + goto err; + + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); + + ret = subvol_has_dirent(trans, s); + if (ret < 0) + break; + + if (fsck_err_on(!ret, + c, subvol_unreachable, + "unreachable subvolume %s", + (bch2_bkey_val_to_text(&buf, c, s.s_c), + buf.buf))) { + ret = reattach_subvol(trans, s); + break; + } + + u32 parent = le32_to_cpu(s.v->fs_path_parent); + + if (darray_u32_has(&subvol_path, parent)) { + if (fsck_err(c, subvol_loop, "subvolume loop")) + ret = reattach_subvol(trans, s); + break; + } + + bch2_trans_iter_exit(trans, &parent_iter); + bch2_trans_iter_init(trans, &parent_iter, + BTREE_ID_subvolumes, POS(0, parent), 0); + k = bch2_btree_iter_peek_slot(&parent_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (fsck_err_on(k.k->type != KEY_TYPE_subvolume, + c, subvol_unreachable, + "unreachable subvolume %s", + (bch2_bkey_val_to_text(&buf, c, s.s_c), + buf.buf))) { + ret = reattach_subvol(trans, s); + break; + } + } +fsck_err: +err: + printbuf_exit(&buf); + darray_exit(&subvol_path); + bch2_trans_iter_exit(trans, &parent_iter); + return ret; +} + +int bch2_check_subvolume_structure(struct bch_fs *c) +{ + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_subvol_path(trans, &iter, k))); + bch_err_fn(c, ret); + return ret; +} + struct pathbuf_entry { u64 inum; u32 snapshot; @@ -2124,20 +2246,6 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) return false; } -static int path_down(struct bch_fs *c, pathbuf *p, - u64 inum, u32 snapshot) -{ - int ret = darray_push(p, ((struct pathbuf_entry) { - .inum = inum, - .snapshot = snapshot, - })); - - if (ret) - bch_err(c, "fsck: error allocating memory for pathbuf, size %zu", - p->size); - return ret; -} - /* * Check that a given inode is reachable from the root: * @@ -2188,11 +2296,12 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino if (!S_ISDIR(inode.bi_mode)) break; - ret = path_down(c, p, inode.bi_inum, snapshot); - if (ret) { - bch_err(c, "memory allocation failure"); + ret = darray_push(p, ((struct pathbuf_entry) { + .inum = inode.bi_inum, + .snapshot = snapshot, + })); + if (ret) return ret; - } snapshot = parent_snapshot; @@ -2219,18 +2328,15 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino pr_err("%llu:%u", i->inum, i->snapshot); pr_err("%llu:%u", inode.bi_inum, snapshot); - if (!fsck_err(c, dir_loop, "directory structure loop")) - return 0; - - ret = remove_backpointer(trans, &inode); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + if (fsck_err(c, dir_loop, "directory structure loop")) { + ret = remove_backpointer(trans, &inode); bch_err_msg(c, ret, "removing dirent"); - if (ret) - break; + if (ret) + break; - ret = reattach_inode(trans, &inode, snapshot); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = reattach_inode(trans, &inode, snapshot); bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); + } break; } } diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h index da991e8cf27e..a4ef94271784 100644 --- a/fs/bcachefs/fsck.h +++ b/fs/bcachefs/fsck.h @@ -8,6 +8,7 @@ int bch2_check_indirect_extents(struct bch_fs *); int bch2_check_dirents(struct bch_fs *); int bch2_check_xattrs(struct bch_fs *); int bch2_check_root(struct bch_fs *); +int bch2_check_subvolume_structure(struct bch_fs *); int bch2_check_directory_structure(struct bch_fs *); int bch2_check_nlinks(struct bch_fs *); int bch2_fix_reflink_p(struct bch_fs *); diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h index f0fc1dbb7239..1361e34d4e64 100644 --- a/fs/bcachefs/recovery_types.h +++ b/fs/bcachefs/recovery_types.h @@ -44,6 +44,7 @@ x(check_dirents, 27, PASS_FSCK) \ x(check_xattrs, 28, PASS_FSCK) \ x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ + x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ x(check_nlinks, 31, PASS_FSCK) \ x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \ -- cgit v1.2.3 From 74406f66adc9c432856dca992e706d54c540c30c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 Feb 2024 23:59:05 -0500 Subject: bcachefs: check_path() now only needs to walk up to subvolume root Now that checking subvolume structure is a separate pass, the main check_directory_connectivity() pass only needs to walk up to a given inode's subvolume root. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 388f181eb0f2..f48033be3f6b 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2247,7 +2247,8 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) } /* - * Check that a given inode is reachable from the root: + * Check that a given inode is reachable from its subvolume root - we already + * verified subvolume connectivity: * * XXX: we should also be verifying that inodes are in the right subvolumes */ @@ -2264,8 +2265,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino BUG_ON(bch2_inode_unpack(inode_k, &inode)); - while (!(inode.bi_inum == BCACHEFS_ROOT_INO && - inode.bi_subvol == BCACHEFS_ROOT_SUBVOL)) { + while (!inode.bi_subvol) { struct btree_iter dirent_iter; struct bkey_s_c_dirent d; u32 parent_snapshot = snapshot; -- cgit v1.2.3 From 3235e04afef8b4ad72d92f05f8f33f6552793223 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Feb 2024 20:03:12 -0500 Subject: bcachefs: more informative write path error message Signed-off-by: Kent Overstreet --- fs/bcachefs/io_write.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 2c098ac017b3..150c272368bf 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -530,7 +530,8 @@ static void __bch2_write_index(struct bch_write_op *op) bch_err_inum_offset_ratelimited(c, insert->k.p.inode, insert->k.p.offset << 9, - "write error while doing btree update: %s", + "%s write error while doing btree update: %s", + op->flags & BCH_WRITE_MOVE ? "move" : "user", bch2_err_str(ret)); } @@ -1067,7 +1068,8 @@ do_write: *_dst = dst; return more; csum_err: - bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)"); + bch_err(c, "%s writ error: error verifying existing checksum while rewriting existing data (memory corruption?)", + op->flags & BCH_WRITE_MOVE ? "move" : "user"); ret = -EIO; err: if (to_wbio(dst)->bounce) @@ -1169,7 +1171,8 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) bch_err_inum_offset_ratelimited(c, insert->k.p.inode, insert->k.p.offset << 9, - "write error while doing btree update: %s", + "%s write error while doing btree update: %s", + op->flags & BCH_WRITE_MOVE ? "move" : "user", bch2_err_str(ret)); } @@ -1449,7 +1452,9 @@ err: bch_err_inum_offset_ratelimited(c, op->pos.inode, op->pos.offset << 9, - "%s(): error: %s", __func__, bch2_err_str(ret)); + "%s(): %s error: %s", __func__, + op->flags & BCH_WRITE_MOVE ? "move" : "user", + bch2_err_str(ret)); op->error = ret; break; } @@ -1573,7 +1578,8 @@ CLOSURE_CALLBACK(bch2_write) bch_err_inum_offset_ratelimited(c, op->pos.inode, op->pos.offset << 9, - "misaligned write"); + "%s write error: misaligned write", + op->flags & BCH_WRITE_MOVE ? "move" : "user"); op->error = -EIO; goto err; } -- cgit v1.2.3 From ba78af9e56662f73a277f84619a0b623a75f19ff Mon Sep 17 00:00:00 2001 From: Daniel Hill Date: Fri, 19 Jan 2024 00:27:44 +1300 Subject: bcachefs: rebalance_status now shows correct units Signed-off-by: Daniel Hill Signed-off-by: Kent Overstreet --- fs/bcachefs/rebalance.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 22d1017aa49b..56336f3dd1d0 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -412,11 +412,11 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) u64 now = atomic64_read(&c->io_clock[WRITE].now); prt_str(out, "io wait duration: "); - bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start); + bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9); prt_newline(out); prt_str(out, "io wait remaining: "); - bch2_prt_human_readable_s64(out, r->wait_iotime_end - now); + bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9); prt_newline(out); prt_str(out, "duration waited: "); -- cgit v1.2.3 From 29e11f96993dd10210bfda0e09f43f307afc3639 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Feb 2024 23:50:05 -0500 Subject: bcachefs: Drop redundant btree_path_downgrade()s If a path doesn't have any active references, we shouldn't downgrade it; it'll either be reused, possibly with intent refs again, or dropped at bch2_trans_begin() time. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 684397442338..b9b151e693ed 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -747,7 +747,8 @@ void bch2_trans_downgrade(struct btree_trans *trans) return; trans_for_each_path(trans, path, i) - bch2_btree_path_downgrade(trans, path); + if (path->ref) + bch2_btree_path_downgrade(trans, path); } int bch2_trans_relock(struct btree_trans *trans) -- cgit v1.2.3 From 06d493fee43be69580082f7b9b48c4d3f9c9de64 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Feb 2024 03:26:19 -0500 Subject: bcachefs: improve bch2_journal_buf_to_text() Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index f5be8e417c8a..597733a84d38 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -53,33 +53,48 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 unsigned i = seq & JOURNAL_BUF_MASK; struct journal_buf *buf = j->buf + i; - prt_printf(out, "seq:"); + prt_str(out, "seq:"); prt_tab(out); prt_printf(out, "%llu", seq); prt_newline(out); printbuf_indent_add(out, 2); - prt_printf(out, "refcount:"); + prt_str(out, "refcount:"); prt_tab(out); prt_printf(out, "%u", journal_state_count(s, i)); prt_newline(out); - prt_printf(out, "size:"); + prt_str(out, "size:"); prt_tab(out); prt_human_readable_u64(out, vstruct_bytes(buf->data)); prt_newline(out); - prt_printf(out, "expires"); + prt_str(out, "expires:"); prt_tab(out); prt_printf(out, "%li jiffies", buf->expires - jiffies); prt_newline(out); + prt_str(out, "flags:"); + prt_tab(out); + if (buf->noflush) + prt_str(out, "noflush "); + if (buf->must_flush) + prt_str(out, "must_flush "); + if (buf->separate_flush) + prt_str(out, "separate_flush "); + if (buf->need_flush_to_write_buffer) + prt_str(out, "need_flush_to_write_buffer "); + if (buf->need_flush_to_write_buffer) + prt_str(out, "need_flush_to_write_buffer "); + if (buf->write_done) + prt_str(out, "write done "); + if (buf->write_started) + prt_str(out, "write started "); + if (buf->write_allocated) + prt_str(out, "write allocated "); if (buf->write_done) - prt_printf(out, "write done\n"); - else if (buf->write_allocated) - prt_printf(out, "write allocated\n"); - else if (buf->write_started) - prt_printf(out, "write started\n"); + prt_str(out, "write done"); + prt_newline(out); printbuf_indent_sub(out, 2); } -- cgit v1.2.3 From a393f3312387597d2e3b6b44bf50ef7f105f623d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Feb 2024 01:08:25 -0500 Subject: bcachefs: Split out discard fastpath Buckets usually can't be discarded until the transaction that made them empty has been committed in the journal. Tracing has indicated that we're queuing the discard worker excessively, only for it to skip over many buckets that are still waiting on a journal commit, discarding only one or two buckets per iteration. We want to switch to only queuing the discard worker after a journal flush write, but there's an important optimization we need to preserve: if a bucket becomes empty and it was never committed in the journal while it was in use, we want to discard it and reuse it right away - since overwriting it before the previous writes are flushed from the device cache eans those writes only cost bus bandwidth. So, this patch implements a fast path for buckets that can be discarded right away. We need new locking between the two discard workers; the new list of buckets being discarded provides that locking. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 146 +++++++++++++++++++++++++++++++++++++++-- fs/bcachefs/alloc_background.h | 1 + fs/bcachefs/bcachefs.h | 6 +- 3 files changed, 146 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index c7be6afe8955..ccd6cbfd470e 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -29,6 +29,8 @@ #include #include +static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket); + /* Persistent alloc info: */ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { @@ -866,14 +868,14 @@ int bch2_trigger_alloc(struct btree_trans *trans, #define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr) #define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk) - if (statechange(a->data_type == BCH_DATA_free && - bucket_flushed(a))) + if (statechange(a->data_type == BCH_DATA_free) && + bucket_flushed(new_a)) closure_wake_up(&c->freelist_wait); - if (statechange(a->data_type == BCH_DATA_need_discard && - bucket_flushed(a)) && - !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset)) - bch2_do_discards(c); + if (statechange(a->data_type == BCH_DATA_need_discard) && + !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && + bucket_flushed(new_a)) + bch2_discard_one_bucket_fast(c, new.k->p); if (statechange(a->data_type == BCH_DATA_cached) && !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && @@ -1609,6 +1611,36 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) return ret; } +static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket) +{ + int ret; + + mutex_lock(&c->discard_buckets_in_flight_lock); + darray_for_each(c->discard_buckets_in_flight, i) + if (bkey_eq(*i, bucket)) { + ret = -EEXIST; + goto out; + } + + ret = darray_push(&c->discard_buckets_in_flight, bucket); +out: + mutex_unlock(&c->discard_buckets_in_flight_lock); + return ret; +} + +static void discard_in_flight_remove(struct bch_fs *c, struct bpos bucket) +{ + mutex_lock(&c->discard_buckets_in_flight_lock); + darray_for_each(c->discard_buckets_in_flight, i) + if (bkey_eq(*i, bucket)) { + darray_remove_item(&c->discard_buckets_in_flight, i); + goto found; + } + BUG(); +found: + mutex_unlock(&c->discard_buckets_in_flight_lock); +} + struct discard_buckets_state { u64 seen; u64 open; @@ -1647,6 +1679,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, struct bch_dev *ca; struct bkey_i_alloc_v4 *a; struct printbuf buf = PRINTBUF; + bool discard_locked = false; int ret = 0; ca = bch_dev_bkey_exists(c, pos.inode); @@ -1714,6 +1747,11 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; } + if (discard_in_flight_add(c, SPOS(iter.pos.inode, iter.pos.offset, true))) + goto out; + + discard_locked = true; + if (!bkey_eq(*discard_pos_done, iter.pos) && ca->mi.discard && !c->opts.nochanges) { /* @@ -1745,6 +1783,8 @@ write: count_event(c, bucket_discard); s->discarded++; out: + if (discard_locked) + discard_in_flight_remove(c, iter.pos); s->seen++; bch2_trans_iter_exit(trans, &iter); percpu_ref_put(&ca->io_ref); @@ -1784,6 +1824,93 @@ void bch2_do_discards(struct bch_fs *c) bch2_write_ref_put(c, BCH_WRITE_REF_discard); } +static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket) +{ + struct btree_iter iter; + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_INTENT); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(k); + if (ret) + goto err; + + struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto err; + + SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); + a->v.data_type = alloc_data_type(a->v, a->v.data_type); + + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static void bch2_do_discards_fast_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, discard_fast_work); + + while (1) { + bool got_bucket = false; + struct bpos bucket; + struct bch_dev *ca; + + mutex_lock(&c->discard_buckets_in_flight_lock); + darray_for_each(c->discard_buckets_in_flight, i) { + if (i->snapshot) + continue; + + ca = bch_dev_bkey_exists(c, i->inode); + + if (!percpu_ref_tryget(&ca->io_ref)) { + darray_remove_item(&c->discard_buckets_in_flight, i); + continue; + } + + got_bucket = true; + bucket = *i; + i->snapshot = true; + break; + } + mutex_unlock(&c->discard_buckets_in_flight_lock); + + if (!got_bucket) + break; + + if (ca->mi.discard && !c->opts.nochanges) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket.offset * ca->mi.bucket_size, + ca->mi.bucket_size, + GFP_KERNEL); + + int ret = bch2_trans_do(c, NULL, NULL, + BCH_WATERMARK_btree| + BCH_TRANS_COMMIT_no_enospc, + bch2_clear_bucket_needs_discard(trans, bucket)); + bch_err_fn(c, ret); + + percpu_ref_put(&ca->io_ref); + discard_in_flight_remove(c, bucket); + + if (ret) + break; + } + + bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); +} + +static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); + + if (!percpu_ref_is_dying(&ca->io_ref) && + !discard_in_flight_add(c, bucket) && + bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) && + !queue_work(c->write_ref_wq, &c->discard_fast_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); +} + static int invalidate_one_bucket(struct btree_trans *trans, struct btree_iter *lru_iter, struct bkey_s_c lru_k, @@ -2215,9 +2342,16 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) set_bit(ca->dev_idx, c->rw_devs[i].d); } +void bch2_fs_allocator_background_exit(struct bch_fs *c) +{ + darray_exit(&c->discard_buckets_in_flight); +} + void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); + mutex_init(&c->discard_buckets_in_flight_lock); INIT_WORK(&c->discard_work, bch2_do_discards_work); + INIT_WORK(&c->discard_fast_work, bch2_do_discards_fast_work); INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work); } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index e7f7e842ee1b..052b2fac25d6 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -269,6 +269,7 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *); void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); +void bch2_fs_allocator_background_exit(struct bch_fs *); void bch2_fs_allocator_background_init(struct bch_fs *); #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index ca80ecf3b8bc..9f25123b1ae8 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -707,6 +707,7 @@ struct btree_trans_buf { x(reflink) \ x(fallocate) \ x(discard) \ + x(discard_fast) \ x(invalidate) \ x(delete_dead_snapshots) \ x(snapshot_delete_pagecache) \ @@ -944,8 +945,11 @@ struct bch_fs { unsigned write_points_nr; struct buckets_waiting_for_journal buckets_waiting_for_journal; - struct work_struct discard_work; struct work_struct invalidate_work; + struct work_struct discard_work; + struct mutex discard_buckets_in_flight_lock; + DARRAY(struct bpos) discard_buckets_in_flight; + struct work_struct discard_fast_work; /* GARBAGE COLLECTION */ struct task_struct *gc_thread; -- cgit v1.2.3 From d9290c9931e4c2ff27b13209610521681418e194 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Feb 2024 17:54:39 -0500 Subject: bcachefs: Fix journal_buf bitfield accesses All jounal_buf bitfield updates must happen under the journal lock - perhaps we should just switch these to atomic bit flags. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_write_buffer.c | 2 ++ fs/bcachefs/journal_io.c | 17 +++++++++++------ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index ac7844861966..a7d86252690a 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -590,7 +590,9 @@ static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_bu entry->type = BCH_JSET_ENTRY_btree_keys; } + spin_lock(&c->journal.lock); buf->need_flush_to_write_buffer = false; + spin_unlock(&c->journal.lock); out: bch2_journal_keys_to_write_buffer_end(c, &dst); return ret; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index c4f19b996b45..3aff628e05ca 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1831,7 +1831,10 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) if (wb.wb) bch2_journal_keys_to_write_buffer_end(c, &wb); + + spin_lock(&c->journal.lock); w->need_flush_to_write_buffer = false; + spin_unlock(&c->journal.lock); start = end = vstruct_last(jset); @@ -1949,12 +1952,20 @@ CLOSURE_CALLBACK(bch2_journal_write) unsigned nr_rw_members = 0; int ret; + for_each_rw_member(c, ca) + nr_rw_members++; + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + BUG_ON(!w->write_started); BUG_ON(w->write_allocated); + BUG_ON(w->write_done); j->write_start_time = local_clock(); spin_lock(&j->lock); + if (nr_rw_members > 1) + w->separate_flush = true; + ret = bch2_journal_write_pick_flush(j, w); spin_unlock(&j->lock); if (ret) @@ -2009,12 +2020,6 @@ CLOSURE_CALLBACK(bch2_journal_write) if (c->opts.nochanges) goto no_io; - for_each_rw_member(c, ca) - nr_rw_members++; - - if (nr_rw_members > 1) - w->separate_flush = true; - /* * Mark journal replicas before we submit the write to guarantee * recovery will find the journal entries after a crash. -- cgit v1.2.3 From 90aa35c4c908c04901a8dc5f8db9b93b919a4084 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Feb 2024 19:56:19 -0500 Subject: bcachefs: Add journal.blocked to journal_debug_to_text() Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 597733a84d38..2e714a32fc7e 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1436,6 +1436,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0); + prt_printf(out, "blocked:\t\t%u\n", j->blocked); prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); prt_printf(out, "current entry:\t\t"); -- cgit v1.2.3 From c7cad231e83606a3ad627d5c4503272435c962a4 Mon Sep 17 00:00:00 2001 From: Calvin Owens Date: Sun, 18 Feb 2024 23:36:08 -0800 Subject: bcachefs: Silence gcc warnings about arm arch ABI drift MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 32-bit arm builds emit a lot of spam like this: fs/bcachefs/backpointers.c: In function ‘extent_matches_bp’: fs/bcachefs/backpointers.c:15:13: note: parameter passing for argument of type ‘struct bch_backpointer’ changed in GCC 9.1 Apply the change from commit ebcc5928c5d9 ("arm64: Silence gcc warnings about arch ABI drift") to fs/bcachefs/ to silence them. Signed-off-by: Calvin Owens Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 1a05cecda7cc..3433959d4f35 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -90,3 +90,6 @@ bcachefs-y := \ xattr.o obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o + +# Silence "note: xyz changed in GCC X.X" messages +subdir-ccflags-y += $(call cc-disable-warning, psabi) -- cgit v1.2.3 From 150194cdcb6b4305be41cd8af7a42dd2d1457ae1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 21 Feb 2024 11:52:03 +0000 Subject: bcachefs: remove redundant assignment to variable ret Variable ret is being assigned a value that is never read, it is being re-assigned a couple of statements later on. The assignment is redundant and can be removed. Cleans up clang scan build warning: fs/bcachefs/super-io.c:806:2: warning: Value stored to 'ret' is never read [deadcode.DeadStores] Signed-off-by: Colin Ian King Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index a3a9e85ab03c..38a5073202c5 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -804,7 +804,6 @@ got_super: goto err; } - ret = 0; sb->have_layout = true; ret = bch2_sb_validate(sb, &err, READ); -- cgit v1.2.3 From eb386617be4bdfe02eb0972874f726e2bfc7a6e7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Feb 2024 22:10:09 -0500 Subject: bcachefs: Errcode tracepoint, documentation Add a tracepoint for downcasting private errors to standard errors, so they can be recovered even when not logged; also, add some documentation. Signed-off-by: Kent Overstreet --- Documentation/filesystems/bcachefs/errorcodes.rst | 30 +++++++++++++++++++++++ fs/bcachefs/errcode.c | 15 +++++++----- fs/bcachefs/errcode.h | 1 + fs/bcachefs/trace.h | 19 ++++++++++++++ 4 files changed, 59 insertions(+), 6 deletions(-) create mode 100644 Documentation/filesystems/bcachefs/errorcodes.rst diff --git a/Documentation/filesystems/bcachefs/errorcodes.rst b/Documentation/filesystems/bcachefs/errorcodes.rst new file mode 100644 index 000000000000..2cccaa0ba7cd --- /dev/null +++ b/Documentation/filesystems/bcachefs/errorcodes.rst @@ -0,0 +1,30 @@ +.. SPDX-License-Identifier: GPL-2.0 + +bcachefs private error codes +---------------------------- + +In bcachefs, as a hard rule we do not throw or directly use standard error +codes (-EINVAL, -EBUSY, etc.). Instead, we define private error codes as needed +in fs/bcachefs/errcode.h. + +This gives us much better error messages and makes debugging much easier. Any +direct uses of standard error codes you see in the source code are simply old +code that has yet to be converted - feel free to clean it up! + +Private error codes may subtype another error code, this allows for grouping of +related errors that should be handled similarly (e.g. transaction restart +errors), as well as specifying which standard error code should be returned at +the bcachefs module boundary. + +At the module boundary, we use bch2_err_class() to convert to a standard error +code; this also emits a trace event so that the original error code be +recovered even if it wasn't logged. + +Do not reuse error codes! Generally speaking, a private error code should only +be thrown in one place. That means that when we see it in a log message we can +see, unambiguously, exactly which file and line number it was returned from. + +Try to give error codes names that are as reasonably descriptive of the error +as possible. Frequently, the error will be logged at a place far removed from +where the error was generated; good names for error codes mean much more +descriptive and useful error messages. diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c index d260ff9bbfeb..43557bebd0f8 100644 --- a/fs/bcachefs/errcode.c +++ b/fs/bcachefs/errcode.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "errcode.h" +#include "trace.h" #include @@ -49,15 +50,17 @@ bool __bch2_err_matches(int err, int class) return err == class; } -int __bch2_err_class(int err) +int __bch2_err_class(int bch_err) { - err = -err; - BUG_ON((unsigned) err >= BCH_ERR_MAX); + int std_err = -bch_err; + BUG_ON((unsigned) std_err >= BCH_ERR_MAX); - while (err >= BCH_ERR_START && bch2_errcode_parents[err - BCH_ERR_START]) - err = bch2_errcode_parents[err - BCH_ERR_START]; + while (std_err >= BCH_ERR_START && bch2_errcode_parents[std_err - BCH_ERR_START]) + std_err = bch2_errcode_parents[std_err - BCH_ERR_START]; + + trace_error_downcast(bch_err, std_err, _RET_IP_); - return -err; + return -std_err; } const char *bch2_blk_status_to_str(blk_status_t status) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index fe3fc14d3c9a..e960a6eae66a 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -78,6 +78,7 @@ x(ENOMEM, ENOMEM_fs_name_alloc) \ x(ENOMEM, ENOMEM_fs_other_alloc) \ x(ENOMEM, ENOMEM_dev_alloc) \ + x(ENOMEM, ENOMEM_disk_accounting) \ x(ENOSPC, ENOSPC_disk_reservation) \ x(ENOSPC, ENOSPC_bucket_alloc) \ x(ENOSPC, ENOSPC_disk_label_add) \ diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 293b90d704fb..6aa81d1e6d36 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -1431,6 +1431,25 @@ DEFINE_EVENT(fs_str, data_update, TP_ARGS(c, str) ); +TRACE_EVENT(error_downcast, + TP_PROTO(int bch_err, int std_err, unsigned long ip), + TP_ARGS(bch_err, std_err, ip), + + TP_STRUCT__entry( + __array(char, bch_err, 32 ) + __array(char, std_err, 32 ) + __array(char, ip, 32 ) + ), + + TP_fast_assign( + strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err)); + strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err)); + snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip); + ), + + TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip) +); + #endif /* _TRACE_BCACHEFS_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 0b5961b0d8a499100bd90a70075a23f875f1ddf6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 Feb 2024 22:46:35 -0500 Subject: bcachefs: jset_entry for loops declare loop iter Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_journal_iter.c | 2 -- fs/bcachefs/btree_write_buffer.c | 2 -- fs/bcachefs/journal_io.c | 3 --- fs/bcachefs/journal_io.h | 4 ++-- 4 files changed, 2 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 3da65562fdb0..eee0dcaf4c13 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -510,8 +510,6 @@ int bch2_journal_keys_sort(struct bch_fs *c) { struct genradix_iter iter; struct journal_replay *i, **_i; - struct jset_entry *entry; - struct bkey_i *k; struct journal_keys *keys = &c->journal_keys; size_t nr_keys = 0, nr_read = 0; diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index a7d86252690a..b77e7b382b66 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -574,8 +574,6 @@ void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) { struct journal_keys_to_wb dst; - struct jset_entry *entry; - struct bkey_i *k; int ret = 0; bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 3aff628e05ca..1c2edfc5448f 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -40,7 +40,6 @@ static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, bch2_journal_ptrs_to_text(out, c, j); - struct jset_entry *entry; for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) { struct jset_entry_datetime *datetime = container_of(entry, struct jset_entry_datetime, entry); @@ -394,7 +393,6 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry *entry) { - struct bkey_i *k; bool first = true; jset_entry_for_each_key(entry, k) { @@ -1815,7 +1813,6 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) if (!wb.wb) bch2_journal_keys_to_write_buffer_start(c, &wb, seq); - struct bkey_i *k; jset_entry_for_each_key(i, k) { ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); if (ret) { diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 1f395f43cf76..4d66c77a35e0 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -39,12 +39,12 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, } #define for_each_jset_entry_type(entry, jset, type) \ - for (entry = (jset)->start; \ + for (struct jset_entry *entry = (jset)->start; \ (entry = __jset_entry_type_next(jset, entry, type)); \ entry = vstruct_next(entry)) #define jset_entry_for_each_key(_e, _k) \ - for (_k = (_e)->start; \ + for (struct bkey_i *_k = (_e)->start; \ _k < vstruct_last(_e); \ _k = bkey_next(_k)) -- cgit v1.2.3 From 894d062254e29dd3e99a55029de5d976b1681eb3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Feb 2024 00:15:56 -0500 Subject: bcachefs: Rename journal_keys.d -> journal_keys.data This will let us use some darray helpers in the next patch. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 6 ++-- fs/bcachefs/btree_journal_iter.c | 68 ++++++++++++++++++++-------------------- fs/bcachefs/recovery.c | 10 +++--- 3 files changed, 42 insertions(+), 42 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 9f25123b1ae8..af7c798945f1 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -667,6 +667,8 @@ struct journal_seq_blacklist_table { }; struct journal_keys { + /* must match layout in darray_types.h */ + size_t nr, size; struct journal_key { u64 journal_seq; u32 journal_offset; @@ -675,15 +677,13 @@ struct journal_keys { bool allocated; bool overwritten; struct bkey_i *k; - } *d; + } *data; /* * Gap buffer: instead of all the empty space in the array being at the * end of the buffer - from @nr to @size - the empty space is at @gap. * This means that sequential insertions are O(n) instead of O(n^2). */ size_t gap; - size_t nr; - size_t size; atomic_t ref; bool initial_ref_held; }; diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index eee0dcaf4c13..db54655b9382 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -42,7 +42,7 @@ static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) { - return keys->d + idx_to_pos(keys, idx); + return keys->data + idx_to_pos(keys, idx); } static size_t __bch2_journal_key_search(struct journal_keys *keys, @@ -182,10 +182,10 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, BUG_ON(test_bit(BCH_FS_rw, &c->flags)); if (idx < keys->size && - journal_key_cmp(&n, &keys->d[idx]) == 0) { - if (keys->d[idx].allocated) - kfree(keys->d[idx].k); - keys->d[idx] = n; + journal_key_cmp(&n, &keys->data[idx]) == 0) { + if (keys->data[idx].allocated) + kfree(keys->data[idx].k); + keys->data[idx] = n; return 0; } @@ -198,17 +198,17 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, .size = max_t(size_t, keys->size, 8) * 2, }; - new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL); - if (!new_keys.d) { + new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL); + if (!new_keys.data) { bch_err(c, "%s: error allocating new key array (size %zu)", __func__, new_keys.size); return -BCH_ERR_ENOMEM_journal_key_insert; } /* Since @keys was full, there was no gap: */ - memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); - kvfree(keys->d); - keys->d = new_keys.d; + memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr); + kvfree(keys->data); + keys->data = new_keys.data; keys->nr = new_keys.nr; keys->size = new_keys.size; @@ -218,11 +218,11 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, journal_iters_move_gap(c, keys->gap, idx); - move_gap(keys->d, keys->nr, keys->size, keys->gap, idx); + move_gap(keys->data, keys->nr, keys->size, keys->gap, idx); keys->gap = idx; keys->nr++; - keys->d[keys->gap++] = n; + keys->data[keys->gap++] = n; journal_iters_fix(c); @@ -269,10 +269,10 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, size_t idx = bch2_journal_key_search(keys, btree, level, pos); if (idx < keys->size && - keys->d[idx].btree_id == btree && - keys->d[idx].level == level && - bpos_eq(keys->d[idx].k->k.p, pos)) - keys->d[idx].overwritten = true; + keys->data[idx].btree_id == btree && + keys->data[idx].level == level && + bpos_eq(keys->data[idx].k->k.p, pos)) + keys->data[idx].overwritten = true; } static void bch2_journal_iter_advance(struct journal_iter *iter) @@ -286,16 +286,16 @@ static void bch2_journal_iter_advance(struct journal_iter *iter) static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) { - struct journal_key *k = iter->keys->d + iter->idx; + struct journal_key *k = iter->keys->data + iter->idx; - while (k < iter->keys->d + iter->keys->size && + while (k < iter->keys->data + iter->keys->size && k->btree_id == iter->btree_id && k->level == iter->level) { if (!k->overwritten) return bkey_i_to_s_c(k->k); bch2_journal_iter_advance(iter); - k = iter->keys->d + iter->idx; + k = iter->keys->data + iter->idx; } return bkey_s_c_null; @@ -474,15 +474,15 @@ void bch2_journal_keys_put(struct bch_fs *c) if (!atomic_dec_and_test(&keys->ref)) return; - move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); + move_gap(keys->data, keys->nr, keys->size, keys->gap, keys->nr); keys->gap = keys->nr; - for (i = keys->d; i < keys->d + keys->nr; i++) + for (i = keys->data; i < keys->data + keys->nr; i++) if (i->allocated) kfree(i->k); - kvfree(keys->d); - keys->d = NULL; + kvfree(keys->data); + keys->data = NULL; keys->nr = keys->gap = keys->size = 0; bch2_journal_entries_free(c); @@ -492,18 +492,18 @@ static void __journal_keys_sort(struct journal_keys *keys) { struct journal_key *src, *dst; - sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); + sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL); - src = dst = keys->d; - while (src < keys->d + keys->nr) { - while (src + 1 < keys->d + keys->nr && + src = dst = keys->data; + while (src < keys->data + keys->nr) { + while (src + 1 < keys->data + keys->nr && !journal_key_cmp(src, src + 1)) src++; *dst++ = *src++; } - keys->nr = dst - keys->d; + keys->nr = dst - keys->data; } int bch2_journal_keys_sort(struct bch_fs *c) @@ -528,17 +528,17 @@ int bch2_journal_keys_sort(struct bch_fs *c) keys->size = roundup_pow_of_two(nr_keys); - keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); - if (!keys->d) { + keys->data = kvmalloc_array(keys->size, sizeof(keys->data[0]), GFP_KERNEL); + if (!keys->data) { bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath", nr_keys); do { keys->size >>= 1; - keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); - } while (!keys->d && keys->size > nr_keys / 8); + keys->data = kvmalloc_array(keys->size, sizeof(keys->data[0]), GFP_KERNEL); + } while (!keys->data && keys->size > nr_keys / 8); - if (!keys->d) { + if (!keys->data) { bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting", keys->size); return -BCH_ERR_ENOMEM_journal_keys_sort; @@ -564,7 +564,7 @@ int bch2_journal_keys_sort(struct bch_fs *c) } } - keys->d[keys->nr++] = (struct journal_key) { + keys->data[keys->nr++] = (struct journal_key) { .btree_id = entry->btree_id, .level = entry->level, .k = k, diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 83f95d22fa81..0aac62a7fe19 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -57,8 +57,8 @@ static void drop_alloc_keys(struct journal_keys *keys) size_t src, dst; for (src = 0, dst = 0; src < keys->nr; src++) - if (!btree_id_is_alloc(keys->d[src].btree_id)) - keys->d[dst++] = keys->d[src]; + if (!btree_id_is_alloc(keys->data[src].btree_id)) + keys->data[dst++] = keys->data[src]; keys->nr = dst; } @@ -72,7 +72,7 @@ static void zero_out_btree_mem_ptr(struct journal_keys *keys) { struct journal_key *i; - for (i = keys->d; i < keys->d + keys->nr; i++) + for (i = keys->data; i < keys->data + keys->nr; i++) if (i->k->k.type == KEY_TYPE_btree_ptr_v2) bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0; } @@ -180,7 +180,7 @@ static int bch2_journal_replay(struct bch_fs *c) for (size_t i = 0; i < keys->nr; i++) { cond_resched(); - struct journal_key *k = keys->d + i; + struct journal_key *k = keys->data + i; /* Skip fastpath if we're low on space in the journal */ ret = c->journal.watermark ? -1 : @@ -535,7 +535,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c) * setting journal_key->overwritten: it will be accessed by multiple * threads */ - move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); + move_gap(keys->data, keys->nr, keys->size, keys->gap, keys->nr); keys->gap = keys->nr; set_bit(BCH_FS_may_go_rw, &c->flags); -- cgit v1.2.3 From 95ffc7fb8c7831ee79ed8d2c0e53c7b4869c6338 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Feb 2024 00:19:09 -0500 Subject: bcachefs: journal_keys now uses darray helpers nice bit of code cleanup Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_journal_iter.c | 78 ++++++++++++---------------------------- fs/bcachefs/recovery.c | 8 ++--- 2 files changed, 25 insertions(+), 61 deletions(-) diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index db54655b9382..9291a897abab 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -467,7 +467,6 @@ static int journal_sort_key_cmp(const void *_l, const void *_r) void bch2_journal_keys_put(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; - struct journal_key *i; BUG_ON(atomic_read(&keys->ref) <= 0); @@ -477,7 +476,7 @@ void bch2_journal_keys_put(struct bch_fs *c) move_gap(keys->data, keys->nr, keys->size, keys->gap, keys->nr); keys->gap = keys->nr; - for (i = keys->data; i < keys->data + keys->nr; i++) + darray_for_each(*keys, i) if (i->allocated) kfree(i->k); @@ -490,17 +489,16 @@ void bch2_journal_keys_put(struct bch_fs *c) static void __journal_keys_sort(struct journal_keys *keys) { - struct journal_key *src, *dst; - sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL); - src = dst = keys->data; - while (src < keys->data + keys->nr) { - while (src + 1 < keys->data + keys->nr && - !journal_key_cmp(src, src + 1)) - src++; + struct journal_key *dst = keys->data; + + darray_for_each(*keys, src) { + if (src + 1 < &darray_top(*keys) && + !journal_key_cmp(src, src + 1)) + continue; - *dst++ = *src++; + *dst++ = *src; } keys->nr = dst - keys->data; @@ -511,39 +509,7 @@ int bch2_journal_keys_sort(struct bch_fs *c) struct genradix_iter iter; struct journal_replay *i, **_i; struct journal_keys *keys = &c->journal_keys; - size_t nr_keys = 0, nr_read = 0; - - genradix_for_each(&c->journal_entries, iter, _i) { - i = *_i; - - if (!i || i->ignore) - continue; - - for_each_jset_key(k, entry, &i->j) - nr_keys++; - } - - if (!nr_keys) - return 0; - - keys->size = roundup_pow_of_two(nr_keys); - - keys->data = kvmalloc_array(keys->size, sizeof(keys->data[0]), GFP_KERNEL); - if (!keys->data) { - bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath", - nr_keys); - - do { - keys->size >>= 1; - keys->data = kvmalloc_array(keys->size, sizeof(keys->data[0]), GFP_KERNEL); - } while (!keys->data && keys->size > nr_keys / 8); - - if (!keys->data) { - bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting", - keys->size); - return -BCH_ERR_ENOMEM_journal_keys_sort; - } - } + size_t nr_read = 0; genradix_for_each(&c->journal_entries, iter, _i) { i = *_i; @@ -554,17 +520,7 @@ int bch2_journal_keys_sort(struct bch_fs *c) cond_resched(); for_each_jset_key(k, entry, &i->j) { - if (keys->nr == keys->size) { - __journal_keys_sort(keys); - - if (keys->nr > keys->size * 7 / 8) { - bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu", - keys->nr, keys->size, nr_read, nr_keys); - return -BCH_ERR_ENOMEM_journal_keys_sort; - } - } - - keys->data[keys->nr++] = (struct journal_key) { + struct journal_key n = (struct journal_key) { .btree_id = entry->btree_id, .level = entry->level, .k = k, @@ -572,6 +528,18 @@ int bch2_journal_keys_sort(struct bch_fs *c) .journal_offset = k->_data - i->j._data, }; + if (darray_push(keys, n)) { + __journal_keys_sort(keys); + + if (keys->nr * 8 > keys->size * 7) { + bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu", + keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq)); + return -BCH_ERR_ENOMEM_journal_keys_sort; + } + + BUG_ON(darray_push(keys, n)); + } + nr_read++; } } @@ -579,6 +547,6 @@ int bch2_journal_keys_sort(struct bch_fs *c) __journal_keys_sort(keys); keys->gap = keys->nr; - bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr); + bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr); return 0; } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 0aac62a7fe19..d773af23a189 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -70,9 +70,7 @@ static void drop_alloc_keys(struct journal_keys *keys) */ static void zero_out_btree_mem_ptr(struct journal_keys *keys) { - struct journal_key *i; - - for (i = keys->data; i < keys->data + keys->nr; i++) + darray_for_each(*keys, i) if (i->k->k.type == KEY_TYPE_btree_ptr_v2) bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0; } @@ -177,11 +175,9 @@ static int bch2_journal_replay(struct bch_fs *c) * efficient - better locality of btree access - but some might fail if * that would cause a journal deadlock. */ - for (size_t i = 0; i < keys->nr; i++) { + darray_for_each(*keys, k) { cond_resched(); - struct journal_key *k = keys->data + i; - /* Skip fastpath if we're low on space in the journal */ ret = c->journal.watermark ? -1 : commit_do(trans, NULL, NULL, -- cgit v1.2.3 From 69426613cdf0784e29e1a429c1a2f372a6267c43 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 Feb 2024 22:43:24 -0500 Subject: bcachefs: improve move_gap() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_journal_iter.c | 6 ++---- fs/bcachefs/recovery.c | 3 +-- fs/bcachefs/util.h | 8 ++++++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 9291a897abab..207dd32e2ecc 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -218,8 +218,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, journal_iters_move_gap(c, keys->gap, idx); - move_gap(keys->data, keys->nr, keys->size, keys->gap, idx); - keys->gap = idx; + move_gap(keys, idx); keys->nr++; keys->data[keys->gap++] = n; @@ -473,8 +472,7 @@ void bch2_journal_keys_put(struct bch_fs *c) if (!atomic_dec_and_test(&keys->ref)) return; - move_gap(keys->data, keys->nr, keys->size, keys->gap, keys->nr); - keys->gap = keys->nr; + move_gap(keys, keys->nr); darray_for_each(*keys, i) if (i->allocated) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index d773af23a189..f11e882de02b 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -531,8 +531,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c) * setting journal_key->overwritten: it will be accessed by multiple * threads */ - move_gap(keys->data, keys->nr, keys->size, keys->gap, keys->nr); - keys->gap = keys->nr; + move_gap(keys, keys->nr); set_bit(BCH_FS_may_go_rw, &c->flags); diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 7fed75c44cd5..b3d3a7349814 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -756,8 +756,12 @@ static inline void __move_gap(void *array, size_t element_size, } /* Move the gap in a gap buffer: */ -#define move_gap(_array, _nr, _size, _old_gap, _new_gap) \ - __move_gap(_array, sizeof(_array[0]), _nr, _size, _old_gap, _new_gap) +#define move_gap(_d, _new_gap) \ +do { \ + __move_gap((_d)->data, sizeof((_d)->data[0]), \ + (_d)->nr, (_d)->size, (_d)->gap, _new_gap); \ + (_d)->gap = _new_gap; \ +} while (0) #define bubble_sort(_base, _nr, _cmp) \ do { \ -- cgit v1.2.3 From 2cce3752cec5a895030c8aa534cef3f493145a8e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 25 Feb 2024 18:48:21 -0500 Subject: bcachefs: split out ignore_blacklisted, ignore_not_dirty prep work for replaying the journal backwards Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_journal_iter.c | 2 +- fs/bcachefs/journal.c | 4 ++-- fs/bcachefs/journal_io.c | 33 +++++++++++++++++++-------------- fs/bcachefs/journal_io.h | 8 +++++++- fs/bcachefs/recovery.c | 7 ++++--- 5 files changed, 33 insertions(+), 21 deletions(-) diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 207dd32e2ecc..50e04356d72c 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -512,7 +512,7 @@ int bch2_journal_keys_sort(struct bch_fs *c) genradix_for_each(&c->journal_entries, iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; cond_resched(); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 2e714a32fc7e..95fd84c08c8c 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1204,7 +1204,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) genradix_for_each_reverse(&c->journal_entries, iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; last_seq = le64_to_cpu(i->j.last_seq); @@ -1237,7 +1237,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) genradix_for_each(&c->journal_entries, iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; seq = le64_to_cpu(i->j.seq); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 1c2edfc5448f..34af25f286aa 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -86,9 +86,12 @@ static void __journal_replay_free(struct bch_fs *c, kvfree(i); } -static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) +static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted) { - i->ignore = true; + if (blacklisted) + i->ignore_blacklisted = true; + else + i->ignore_not_dirty = true; if (!c->opts.read_entire_journal) __journal_replay_free(c, i); @@ -138,12 +141,13 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, journal_entry_radix_idx(c, jlist->last_seq)) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; if (le64_to_cpu(i->j.seq) >= last_seq) break; - journal_replay_free(c, i); + + journal_replay_free(c, i, false); } } @@ -199,8 +203,9 @@ replace: return -BCH_ERR_ENOMEM_journal_entry_add; darray_init(&i->ptrs); - i->csum_good = entry_ptr.csum_good; - i->ignore = false; + i->csum_good = entry_ptr.csum_good; + i->ignore_blacklisted = false; + i->ignore_not_dirty = false; unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); if (dup) { @@ -1255,20 +1260,20 @@ int bch2_journal_read(struct bch_fs *c, i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; if (!*start_seq) *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; if (JSET_NO_FLUSH(&i->j)) { - i->ignore = true; + i->ignore_blacklisted = true; continue; } if (!last_write_torn && !i->csum_good) { last_write_torn = true; - i->ignore = true; + i->ignore_blacklisted = true; continue; } @@ -1307,12 +1312,12 @@ int bch2_journal_read(struct bch_fs *c, genradix_for_each(&c->journal_entries, radix_iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; seq = le64_to_cpu(i->j.seq); if (seq < *last_seq) { - journal_replay_free(c, i); + journal_replay_free(c, i, false); continue; } @@ -1320,7 +1325,7 @@ int bch2_journal_read(struct bch_fs *c, fsck_err_on(!JSET_NO_FLUSH(&i->j), c, jset_seq_blacklisted, "found blacklisted journal entry %llu", seq); - i->ignore = true; + i->ignore_blacklisted = true; } } @@ -1329,7 +1334,7 @@ int bch2_journal_read(struct bch_fs *c, genradix_for_each(&c->journal_entries, radix_iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; BUG_ON(seq > le64_to_cpu(i->j.seq)); @@ -1382,7 +1387,7 @@ int bch2_journal_read(struct bch_fs *c, }; i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; darray_for_each(i->ptrs, ptr) { diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 4d66c77a35e0..4f1e763ab506 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -20,11 +20,17 @@ struct journal_replay { DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs; bool csum_good; - bool ignore; + bool ignore_blacklisted; + bool ignore_not_dirty; /* must be last: */ struct jset j; }; +static inline bool journal_replay_ignore(struct journal_replay *i) +{ + return !i || i->ignore_blacklisted || i->ignore_not_dirty; +} + static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, struct jset_entry *entry, unsigned type) { diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index f11e882de02b..0c579ba8de78 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -366,7 +366,7 @@ static int journal_replay_early(struct bch_fs *c, genradix_for_each(&c->journal_entries, iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; vstruct_for_each(&i->j, entry) { @@ -868,7 +868,7 @@ int bch2_fs_recovery(struct bch_fs *c) goto out; genradix_for_each_reverse(&c->journal_entries, iter, i) - if (*i && !(*i)->ignore) { + if (!journal_replay_ignore(*i)) { last_journal_entry = &(*i)->j; break; } @@ -893,7 +893,8 @@ int bch2_fs_recovery(struct bch_fs *c) genradix_for_each_reverse(&c->journal_entries, iter, i) if (*i) { last_journal_entry = &(*i)->j; - (*i)->ignore = false; + (*i)->ignore_blacklisted = false; + (*i)->ignore_not_dirty= false; /* * This was probably a NO_FLUSH entry, * so last_seq was garbage - but we know -- cgit v1.2.3 From 79162e829b5e5859409736b19daa2f6d2d1e0b59 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Mon, 19 Feb 2024 20:24:32 +0800 Subject: bcachefs: fix the error code when mounting with incorrect options. When mount with incorrect options such as: "mount -t bcachefs -o errors=back /dev/loop1 /mnt/bcachefs/". It rebacks the error "mount: /mnt/bcachefs: permission denied." cause bch2_parse_mount_opts returns -1 and bch2_mount throws it up. This is unreasonable. The real error message should be like this: "mount: /mnt/bcachefs: wrong fs type, bad option, bad superblock on /dev/loop1, missing codepage or helper program, or other error." Adding three private error codes for mounting error. Here are: - BCH_ERR_mount_option as the parent class for option error. - BCH_ERR_option_name represents the invalid option name. - BCH_ERR_option_value represents the invalid option value. Signed-off-by: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 3 +++ fs/bcachefs/fs.c | 4 +++- fs/bcachefs/opts.c | 6 +++--- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index e960a6eae66a..a8f1a3976305 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -5,6 +5,9 @@ #define BCH_ERRCODES() \ x(ERANGE, ERANGE_option_too_small) \ x(ERANGE, ERANGE_option_too_big) \ + x(EINVAL, mount_option) \ + x(BCH_ERR_mount_option, option_name) \ + x(BCH_ERR_mount_option, option_value) \ x(ENOMEM, ENOMEM_stripe_buf) \ x(ENOMEM, ENOMEM_replicas_table) \ x(ENOMEM, ENOMEM_cpu_replicas) \ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 093f5404a655..3f073845bbd7 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1870,8 +1870,10 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, opt_set(opts, read_only, (flags & SB_RDONLY) != 0); ret = bch2_parse_mount_opts(NULL, &opts, data); - if (ret) + if (ret) { + ret = bch2_err_class(ret); return ERR_PTR(ret); + } if (!dev_name || strlen(dev_name) == 0) return ERR_PTR(-EINVAL); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index b1ed0b9a20d3..1db11c15b2b9 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -456,7 +456,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, copied_opts = kstrdup(options, GFP_KERNEL); if (!copied_opts) - return -1; + return -ENOMEM; copied_opts_start = copied_opts; while ((opt = strsep(&copied_opts, ",")) != NULL) { @@ -501,11 +501,11 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, bad_opt: pr_err("Bad mount option %s", name); - ret = -1; + ret = -BCH_ERR_option_name; goto out; bad_val: pr_err("Invalid mount option %s", err.buf); - ret = -1; + ret = -BCH_ERR_option_value; goto out; out: kfree(copied_opts_start); -- cgit v1.2.3 From 7efa287526f02a54a0a9abf358e15623c5b09f93 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Feb 2024 21:56:57 -0500 Subject: bcachefs: Fix bch2_journal_noflush_seq() Improved journal pipelining broke journal_noflush_seq(); it implicitly assumed only the oldest outstanding journal buf could be in flight, but that's no longer true. Make this more straightforward by just setting buf->must_flush whenever we know a journal buf is going to be flush. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 10 +++++----- fs/bcachefs/journal_io.c | 1 + 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 95fd84c08c8c..9f208fcb9575 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -716,7 +716,7 @@ recheck_need_open: return ret; seq = res.seq; - buf = j->buf + (seq & JOURNAL_BUF_MASK); + buf = journal_seq_to_buf(j, seq); buf->must_flush = true; if (!buf->flush_time) { @@ -734,8 +734,8 @@ recheck_need_open: } /* - * if write was kicked off without a flush, flush the next sequence - * number instead + * if write was kicked off without a flush, or if we promised it + * wouldn't be a flush, flush the next sequence number instead */ buf = journal_seq_to_buf(j, seq); if (buf->noflush) { @@ -813,8 +813,8 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq) unwritten_seq++) { struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); - /* journal write is already in flight, and was a flush write: */ - if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush) + /* journal flush already in flight, or flush requseted */ + if (buf->must_flush) goto out; buf->noflush = true; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 34af25f286aa..0b49b2383d82 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1936,6 +1936,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * j->nr_noflush_writes++; } else { + w->must_flush = true; j->last_flush_write = jiffies; j->nr_flush_writes++; clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags); -- cgit v1.2.3 From 66a67c860cce3643248f7e80ee095b946829a342 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Feb 2024 18:28:48 -0500 Subject: fs: file_remove_privs_flags() Rename and export __file_remove_privs(); for a buffered write path that doesn't take the inode lock we need to be able to check if the operation needs to do work first. Signed-off-by: Kent Overstreet Cc: Alexander Viro Cc: Christian Brauner --- fs/inode.c | 7 ++++--- include/linux/fs.h | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 91048c4c9c9e..b465afdbfcef 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2031,7 +2031,7 @@ static int __remove_privs(struct mnt_idmap *idmap, return notify_change(idmap, dentry, &newattrs, NULL); } -static int __file_remove_privs(struct file *file, unsigned int flags) +int file_remove_privs_flags(struct file *file, unsigned int flags) { struct dentry *dentry = file_dentry(file); struct inode *inode = file_inode(file); @@ -2056,6 +2056,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags) inode_has_no_xattr(inode); return error; } +EXPORT_SYMBOL_GPL(file_remove_privs_flags); /** * file_remove_privs - remove special file privileges (suid, capabilities) @@ -2068,7 +2069,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags) */ int file_remove_privs(struct file *file) { - return __file_remove_privs(file, 0); + return file_remove_privs_flags(file, 0); } EXPORT_SYMBOL(file_remove_privs); @@ -2161,7 +2162,7 @@ static int file_modified_flags(struct file *file, int flags) * Clear the security bits if the process is not being run by root. * This keeps people from modifying setuid and setgid binaries. */ - ret = __file_remove_privs(file, flags); + ret = file_remove_privs_flags(file, flags); if (ret) return ret; diff --git a/include/linux/fs.h b/include/linux/fs.h index 1fbc72c5f112..14ea66b62823 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3004,6 +3004,7 @@ extern struct inode *new_inode_pseudo(struct super_block *sb); extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *); +extern int file_remove_privs_flags(struct file *file, unsigned int flags); extern int file_remove_privs(struct file *); int setattr_should_drop_sgid(struct mnt_idmap *idmap, const struct inode *inode); -- cgit v1.2.3 From 7e64c86cdc6cee0db36eb983c62065c5bf71508b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Feb 2024 18:30:22 -0500 Subject: bcachefs: Buffered write path now can avoid the inode lock Non append, non extending buffered writes can now avoid taking the inode lock. To ensure atomicity of writes w.r.t. other writes, we lock every folio that we'll be writing to, and if this fails we fall back to taking the inode lock. Extensive comments are provided as to corner cases. Link: https://lore.kernel.org/linux-fsdevel/Zdkxfspq3urnrM6I@bombadil.infradead.org/ Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 3 +- fs/bcachefs/fs-io-buffered.c | 149 +++++++++++++++++++++++++++++++------------ 2 files changed, 111 insertions(+), 41 deletions(-) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index a8f1a3976305..22c7a3322bcc 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -250,7 +250,8 @@ x(BCH_ERR_nopromote, nopromote_congested) \ x(BCH_ERR_nopromote, nopromote_in_flight) \ x(BCH_ERR_nopromote, nopromote_no_writes) \ - x(BCH_ERR_nopromote, nopromote_enomem) + x(BCH_ERR_nopromote, nopromote_enomem) \ + x(0, need_inode_lock) enum bch_errcode { BCH_ERR_START = 2048, diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 27710cdd5710..39292e7ef342 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -810,7 +810,8 @@ static noinline void folios_trunc(folios *fs, struct folio **fi) static int __bch2_buffered_write(struct bch_inode_info *inode, struct address_space *mapping, struct iov_iter *iter, - loff_t pos, unsigned len) + loff_t pos, unsigned len, + bool inode_locked) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_folio_reservation res; @@ -835,6 +836,15 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, BUG_ON(!fs.nr); + /* + * If we're not using the inode lock, we need to lock all the folios for + * atomiticity of writes vs. other writes: + */ + if (!inode_locked && folio_end_pos(darray_last(fs)) < end) { + ret = -BCH_ERR_need_inode_lock; + goto out; + } + f = darray_first(fs); if (pos != folio_pos(f) && !folio_test_uptodate(f)) { ret = bch2_read_single_folio(f, mapping); @@ -929,8 +939,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, end = pos + copied; spin_lock(&inode->v.i_lock); - if (end > inode->v.i_size) + if (end > inode->v.i_size) { + BUG_ON(!inode_locked); i_size_write(&inode->v, end); + } spin_unlock(&inode->v.i_lock); f_pos = pos; @@ -974,12 +986,68 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct bch_inode_info *inode = file_bch_inode(file); - loff_t pos = iocb->ki_pos; - ssize_t written = 0; - int ret = 0; + loff_t pos; + bool inode_locked = false; + ssize_t written = 0, written2 = 0, ret = 0; + + /* + * We don't take the inode lock unless i_size will be changing. Folio + * locks provide exclusion with other writes, and the pagecache add lock + * provides exclusion with truncate and hole punching. + * + * There is one nasty corner case where atomicity would be broken + * without great care: when copying data from userspace to the page + * cache, we do that with faults disable - a page fault would recurse + * back into the filesystem, taking filesystem locks again, and + * deadlock; so it's done with faults disabled, and we fault in the user + * buffer when we aren't holding locks. + * + * If we do part of the write, but we then race and in the userspace + * buffer have been evicted and are no longer resident, then we have to + * drop our folio locks to re-fault them in, breaking write atomicity. + * + * To fix this, we restart the write from the start, if we weren't + * holding the inode lock. + * + * There is another wrinkle after that; if we restart the write from the + * start, and then get an unrecoverable error, we _cannot_ claim to + * userspace that we did not write data we actually did - so we must + * track (written2) the most we ever wrote. + */ + + if ((iocb->ki_flags & IOCB_APPEND) || + (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) { + inode_lock(&inode->v); + inode_locked = true; + } + + ret = generic_write_checks(iocb, iter); + if (ret <= 0) + goto unlock; + + ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0); + if (ret) { + if (!inode_locked) { + inode_lock(&inode->v); + inode_locked = true; + ret = file_remove_privs_flags(file, 0); + } + if (ret) + goto unlock; + } + + ret = file_update_time(file); + if (ret) + goto unlock; + + pos = iocb->ki_pos; bch2_pagecache_add_get(inode); + if (!inode_locked && + (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) + goto get_inode_lock; + do { unsigned offset = pos & (PAGE_SIZE - 1); unsigned bytes = iov_iter_count(iter); @@ -1004,12 +1072,17 @@ again: } } + if (unlikely(bytes != iov_iter_count(iter) && !inode_locked)) + goto get_inode_lock; + if (unlikely(fatal_signal_pending(current))) { ret = -EINTR; break; } - ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); + ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked); + if (ret == -BCH_ERR_need_inode_lock) + goto get_inode_lock; if (unlikely(ret < 0)) break; @@ -1030,50 +1103,46 @@ again: } pos += ret; written += ret; + written2 = max(written, written2); + + if (ret != bytes && !inode_locked) + goto get_inode_lock; ret = 0; balance_dirty_pages_ratelimited(mapping); - } while (iov_iter_count(iter)); + if (0) { +get_inode_lock: + bch2_pagecache_add_put(inode); + inode_lock(&inode->v); + inode_locked = true; + bch2_pagecache_add_get(inode); + + iov_iter_revert(iter, written); + pos -= written; + written = 0; + ret = 0; + } + } while (iov_iter_count(iter)); bch2_pagecache_add_put(inode); +unlock: + if (inode_locked) + inode_unlock(&inode->v); + + iocb->ki_pos += written; - return written ? written : ret; + ret = max(written, written2) ?: ret; + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; } -ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) +ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter) { - struct file *file = iocb->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - ssize_t ret; - - if (iocb->ki_flags & IOCB_DIRECT) { - ret = bch2_direct_write(iocb, from); - goto out; - } - - inode_lock(&inode->v); - - ret = generic_write_checks(iocb, from); - if (ret <= 0) - goto unlock; - - ret = file_remove_privs(file); - if (ret) - goto unlock; - - ret = file_update_time(file); - if (ret) - goto unlock; - - ret = bch2_buffered_write(iocb, from); - if (likely(ret > 0)) - iocb->ki_pos += ret; -unlock: - inode_unlock(&inode->v); + ssize_t ret = iocb->ki_flags & IOCB_DIRECT + ? bch2_direct_write(iocb, iter) + : bch2_buffered_write(iocb, iter); - if (ret > 0) - ret = generic_write_sync(iocb, ret); -out: return bch2_err_class(ret); } -- cgit v1.2.3 From 7e23c1746b027ea6075816280a4e463a9399d9da Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Fri, 1 Mar 2024 11:17:45 +0800 Subject: bcachefs: avoid returning private error code in bch2_xattr_bcachefs_set Avoid the private error code return to caller. The error code should be transformed into genernal error code. Signed-off-by: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/xattr.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 9c0d2316031b..754f17bba68e 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -544,11 +544,11 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, kfree(buf); if (ret < 0) - return ret; + goto err_class_exit; ret = bch2_opt_check_may_set(c, opt_id, v); if (ret < 0) - return ret; + goto err_class_exit; s.v = v + 1; s.defined = true; @@ -595,6 +595,7 @@ err: (opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression)))) bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum); +err_class_exit: return bch2_err_class(ret); } -- cgit v1.2.3 From 2a68d611a17b8135ade5f7016144ef34af0a61a4 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Fri, 1 Mar 2024 14:38:33 +0800 Subject: bcachefs: intercept mountoption value for bool type For mount option with bool type, the value must be 0 or 1 (See bch2_opt_parse). But this seems does not well intercepted cause for other value(like 2...), it returns the unexpect return code with error message printed. Signed-off-by: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 1 + fs/bcachefs/opts.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 22c7a3322bcc..af25d8ec60f2 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -8,6 +8,7 @@ x(EINVAL, mount_option) \ x(BCH_ERR_mount_option, option_name) \ x(BCH_ERR_mount_option, option_value) \ + x(BCH_ERR_mount_option, option_not_bool) \ x(ENOMEM, ENOMEM_stripe_buf) \ x(ENOMEM, ENOMEM_replicas_table) \ x(ENOMEM, ENOMEM_cpu_replicas) \ diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 1db11c15b2b9..08ea0cfc4aef 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -314,7 +314,7 @@ int bch2_opt_parse(struct bch_fs *c, if (ret < 0 || (*res != 0 && *res != 1)) { if (err) prt_printf(err, "%s: must be bool", opt->attr.name); - return ret; + return ret < 0 ? ret : -BCH_ERR_option_not_bool; } break; case BCH_OPT_UINT: -- cgit v1.2.3 From ada02c207c0168e0e9173983d00f82c9b5de90c8 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 1 Mar 2024 12:49:24 -0500 Subject: bcachefs: fix lost journal buf wakeup due to improved pipelining The journal_write_done() handler was reworked into a loop in commit 746a33c96b7a ("bcachefs: better journal pipelining"). As part of this, the journal buffer wake was factored into a post-loop branch that executes if at least one journal buffer has completed. The journal buffer processing loop iterates on the journal buffer pointer, however. This means that w refers to the last buffer processed by the loop, which may or may not be done. This also means that if multiple buffers are processed by the loop, only the last is awoken. This lost wakeup behavior has lead to stalling problems in various CI and fstests, such as generic/703. Lift the wake into the loop so each done buffer sees a wake call as it is processed. Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 0b49b2383d82..8ded059591c7 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1672,6 +1672,7 @@ static CLOSURE_CALLBACK(journal_write_done) new.unwritten_idx++; } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); + closure_wake_up(&w->wait); completed = true; } @@ -1682,7 +1683,6 @@ static CLOSURE_CALLBACK(journal_write_done) track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], &j->max_in_flight_start, false); - closure_wake_up(&w->wait); journal_wake(j); } -- cgit v1.2.3 From ba81523eaac3df20ea884603bd67a74089900814 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 1 Mar 2024 18:49:09 -0500 Subject: bcachefs: Split out bkey_types.h We're going to need bkey_types.h in bcachefs_ioctl.h in a future patch. Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.h | 202 +------------------------------------------- fs/bcachefs/bkey_types.h | 213 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 214 insertions(+), 201 deletions(-) create mode 100644 fs/bcachefs/bkey_types.h diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 2a8b8fef539c..cf23ff47bed8 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -4,7 +4,7 @@ #include #include "bcachefs_format.h" - +#include "bkey_types.h" #include "btree_types.h" #include "util.h" #include "vstructs.h" @@ -31,57 +31,6 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *, const struct bkey_format *, const struct bkey_packed *); -/* bkey with split value, const */ -struct bkey_s_c { - const struct bkey *k; - const struct bch_val *v; -}; - -/* bkey with split value */ -struct bkey_s { - union { - struct { - struct bkey *k; - struct bch_val *v; - }; - struct bkey_s_c s_c; - }; -}; - -#define bkey_p_next(_k) vstruct_next(_k) - -static inline struct bkey_i *bkey_next(struct bkey_i *k) -{ - return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s); -} - -#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) - -static inline size_t bkey_val_bytes(const struct bkey *k) -{ - return bkey_val_u64s(k) * sizeof(u64); -} - -static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) -{ - unsigned u64s = BKEY_U64s + val_u64s; - - BUG_ON(u64s > U8_MAX); - k->u64s = u64s; -} - -static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) -{ - set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64))); -} - -#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) - -#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) - -#define bkey_whiteout(_k) \ - ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) - enum bkey_lr_packed { BKEY_PACKED_BOTH, BKEY_PACKED_RIGHT, @@ -550,155 +499,6 @@ static inline void bkey_reassemble(struct bkey_i *dst, memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k)); } -#define bkey_s_null ((struct bkey_s) { .k = NULL }) -#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) - -#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) -#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) - -static inline struct bkey_s bkey_to_s(struct bkey *k) -{ - return (struct bkey_s) { .k = k, .v = NULL }; -} - -static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) -{ - return (struct bkey_s_c) { .k = k, .v = NULL }; -} - -static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) -{ - return (struct bkey_s) { .k = &k->k, .v = &k->v }; -} - -static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) -{ - return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; -} - -/* - * For a given type of value (e.g. struct bch_extent), generates the types for - * bkey + bch_extent - inline, split, split const - and also all the conversion - * functions, which also check that the value is of the correct type. - * - * We use anonymous unions for upcasting - e.g. converting from e.g. a - * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion - * functions. - */ -#define x(name, ...) \ -struct bkey_i_##name { \ - union { \ - struct bkey k; \ - struct bkey_i k_i; \ - }; \ - struct bch_##name v; \ -}; \ - \ -struct bkey_s_c_##name { \ - union { \ - struct { \ - const struct bkey *k; \ - const struct bch_##name *v; \ - }; \ - struct bkey_s_c s_c; \ - }; \ -}; \ - \ -struct bkey_s_##name { \ - union { \ - struct { \ - struct bkey *k; \ - struct bch_##name *v; \ - }; \ - struct bkey_s_c_##name c; \ - struct bkey_s s; \ - struct bkey_s_c s_c; \ - }; \ -}; \ - \ -static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return container_of(&k->k, struct bkey_i_##name, k); \ -} \ - \ -static inline const struct bkey_i_##name * \ -bkey_i_to_##name##_c(const struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return container_of(&k->k, struct bkey_i_##name, k); \ -} \ - \ -static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ - return (struct bkey_s_##name) { \ - .k = k.k, \ - .v = container_of(k.v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ - return (struct bkey_s_c_##name) { \ - .k = k.k, \ - .v = container_of(k.v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ -{ \ - return (struct bkey_s_##name) { \ - .k = &k->k, \ - .v = &k->v, \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name \ -name##_i_to_s_c(const struct bkey_i_##name *k) \ -{ \ - return (struct bkey_s_c_##name) { \ - .k = &k->k, \ - .v = &k->v, \ - }; \ -} \ - \ -static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return (struct bkey_s_##name) { \ - .k = &k->k, \ - .v = container_of(&k->v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name \ -bkey_i_to_s_c_##name(const struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return (struct bkey_s_c_##name) { \ - .k = &k->k, \ - .v = container_of(&k->v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ -{ \ - struct bkey_i_##name *k = \ - container_of(&_k->k, struct bkey_i_##name, k); \ - \ - bkey_init(&k->k); \ - memset(&k->v, 0, sizeof(k->v)); \ - k->k.type = KEY_TYPE_##name; \ - set_bkey_val_bytes(&k->k, sizeof(k->v)); \ - \ - return k; \ -} - -BCH_BKEY_TYPES(); -#undef x - /* byte order helpers */ #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h new file mode 100644 index 000000000000..c9ae9e42b385 --- /dev/null +++ b/fs/bcachefs/bkey_types.h @@ -0,0 +1,213 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_TYPES_H +#define _BCACHEFS_BKEY_TYPES_H + +#include "bcachefs_format.h" + +/* + * bkey_i - bkey with inline value + * bkey_s - bkey with split value + * bkey_s_c - bkey with split value, const + */ + +#define bkey_p_next(_k) vstruct_next(_k) + +static inline struct bkey_i *bkey_next(struct bkey_i *k) +{ + return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s); +} + +#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) + +static inline size_t bkey_val_bytes(const struct bkey *k) +{ + return bkey_val_u64s(k) * sizeof(u64); +} + +static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) +{ + unsigned u64s = BKEY_U64s + val_u64s; + + BUG_ON(u64s > U8_MAX); + k->u64s = u64s; +} + +static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) +{ + set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64))); +} + +#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) + +#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) + +#define bkey_whiteout(_k) \ + ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) + +/* bkey with split value, const */ +struct bkey_s_c { + const struct bkey *k; + const struct bch_val *v; +}; + +/* bkey with split value */ +struct bkey_s { + union { + struct { + struct bkey *k; + struct bch_val *v; + }; + struct bkey_s_c s_c; + }; +}; + +#define bkey_s_null ((struct bkey_s) { .k = NULL }) +#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) + +#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) +#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) + +static inline struct bkey_s bkey_to_s(struct bkey *k) +{ + return (struct bkey_s) { .k = k, .v = NULL }; +} + +static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) +{ + return (struct bkey_s_c) { .k = k, .v = NULL }; +} + +static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) +{ + return (struct bkey_s) { .k = &k->k, .v = &k->v }; +} + +static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) +{ + return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; +} + +/* + * For a given type of value (e.g. struct bch_extent), generates the types for + * bkey + bch_extent - inline, split, split const - and also all the conversion + * functions, which also check that the value is of the correct type. + * + * We use anonymous unions for upcasting - e.g. converting from e.g. a + * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion + * functions. + */ +#define x(name, ...) \ +struct bkey_i_##name { \ + union { \ + struct bkey k; \ + struct bkey_i k_i; \ + }; \ + struct bch_##name v; \ +}; \ + \ +struct bkey_s_c_##name { \ + union { \ + struct { \ + const struct bkey *k; \ + const struct bch_##name *v; \ + }; \ + struct bkey_s_c s_c; \ + }; \ +}; \ + \ +struct bkey_s_##name { \ + union { \ + struct { \ + struct bkey *k; \ + struct bch_##name *v; \ + }; \ + struct bkey_s_c_##name c; \ + struct bkey_s s; \ + struct bkey_s_c s_c; \ + }; \ +}; \ + \ +static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return container_of(&k->k, struct bkey_i_##name, k); \ +} \ + \ +static inline const struct bkey_i_##name * \ +bkey_i_to_##name##_c(const struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return container_of(&k->k, struct bkey_i_##name, k); \ +} \ + \ +static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ + return (struct bkey_s_##name) { \ + .k = k.k, \ + .v = container_of(k.v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ + return (struct bkey_s_c_##name) { \ + .k = k.k, \ + .v = container_of(k.v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ +{ \ + return (struct bkey_s_##name) { \ + .k = &k->k, \ + .v = &k->v, \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name \ +name##_i_to_s_c(const struct bkey_i_##name *k) \ +{ \ + return (struct bkey_s_c_##name) { \ + .k = &k->k, \ + .v = &k->v, \ + }; \ +} \ + \ +static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return (struct bkey_s_##name) { \ + .k = &k->k, \ + .v = container_of(&k->v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name \ +bkey_i_to_s_c_##name(const struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return (struct bkey_s_c_##name) { \ + .k = &k->k, \ + .v = container_of(&k->v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ +{ \ + struct bkey_i_##name *k = \ + container_of(&_k->k, struct bkey_i_##name, k); \ + \ + bkey_init(&k->k); \ + memset(&k->v, 0, sizeof(k->v)); \ + k->k.type = KEY_TYPE_##name; \ + set_bkey_val_bytes(&k->k, sizeof(k->v)); \ + \ + return k; \ +} + +BCH_BKEY_TYPES(); +#undef x + +#endif /* _BCACHEFS_BKEY_TYPES_H */ -- cgit v1.2.3 From d64547999c591c47bfac279fa4027bdbd29c7ea0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 2 Mar 2024 15:30:33 -0500 Subject: bcachefs: copy_(to|from)_user_errcode() we've got some helpers that return errors sanely, move them to a more common location for use in fs-ioctl.c Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 6 ------ fs/bcachefs/util.h | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index b584d78cb39c..38defa19d52d 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -22,12 +22,6 @@ #include #include -__must_check -static int copy_to_user_errcode(void __user *to, const void *from, unsigned long n) -{ - return copy_to_user(to, from, n) ? -EFAULT : 0; -} - /* returns with ref on ca->ref */ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, unsigned flags) diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index b3d3a7349814..adff17f6f524 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -848,4 +848,20 @@ static inline bool qstr_eq(const struct qstr l, const struct qstr r) void bch2_darray_str_exit(darray_str *); int bch2_split_devs(const char *, darray_str *); +#ifdef __KERNEL__ + +__must_check +static inline int copy_to_user_errcode(void __user *to, const void *from, unsigned long n) +{ + return copy_to_user(to, from, n) ? -EFAULT : 0; +} + +__must_check +static inline int copy_from_user_errcode(void *to, const void __user *from, unsigned long n) +{ + return copy_from_user(to, from, n) ? -EFAULT : 0; +} + +#endif + #endif /* _BCACHEFS_UTIL_H */ -- cgit v1.2.3 From 3a319a2476d27e0b6c3cac3ebf6e3d0b665a06e5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Mar 2024 22:32:06 -0500 Subject: lib/generic-radix-tree.c: Make nodes more reasonably sized this code originally used the page allocator directly, but most code shouldn't do that - PAGE_SIZE varies with architecture, and slab is faster. 4k is also on the large side for typical usage, 512 bytes is a better choice for typical usage that might be somewhat sparse. Signed-off-by: Kent Overstreet --- include/linux/generic-radix-tree.h | 29 ++++++++++++++++------------- lib/generic-radix-tree.c | 35 ++++++++++++----------------------- 2 files changed, 28 insertions(+), 36 deletions(-) diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h index 847413164738..f3512fddf3d7 100644 --- a/include/linux/generic-radix-tree.h +++ b/include/linux/generic-radix-tree.h @@ -5,7 +5,7 @@ * DOC: Generic radix trees/sparse arrays * * Very simple and minimalistic, supporting arbitrary size entries up to - * PAGE_SIZE. + * GENRADIX_NODE_SIZE. * * A genradix is defined with the type it will store, like so: * @@ -45,12 +45,15 @@ struct genradix_root; +#define GENRADIX_NODE_SHIFT 9 +#define GENRADIX_NODE_SIZE (1U << GENRADIX_NODE_SHIFT) + struct __genradix { struct genradix_root *root; }; /* - * NOTE: currently, sizeof(_type) must not be larger than PAGE_SIZE: + * NOTE: currently, sizeof(_type) must not be larger than GENRADIX_NODE_SIZE: */ #define __GENRADIX_INITIALIZER \ @@ -101,14 +104,14 @@ void __genradix_free(struct __genradix *); static inline size_t __idx_to_offset(size_t idx, size_t obj_size) { if (__builtin_constant_p(obj_size)) - BUILD_BUG_ON(obj_size > PAGE_SIZE); + BUILD_BUG_ON(obj_size > GENRADIX_NODE_SIZE); else - BUG_ON(obj_size > PAGE_SIZE); + BUG_ON(obj_size > GENRADIX_NODE_SIZE); if (!is_power_of_2(obj_size)) { - size_t objs_per_page = PAGE_SIZE / obj_size; + size_t objs_per_page = GENRADIX_NODE_SIZE / obj_size; - return (idx / objs_per_page) * PAGE_SIZE + + return (idx / objs_per_page) * GENRADIX_NODE_SIZE + (idx % objs_per_page) * obj_size; } else { return idx * obj_size; @@ -118,9 +121,9 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size) #define __genradix_cast(_radix) (typeof((_radix)->type[0]) *) #define __genradix_obj_size(_radix) sizeof((_radix)->type[0]) #define __genradix_objs_per_page(_radix) \ - (PAGE_SIZE / sizeof((_radix)->type[0])) + (GENRADIX_NODE_SIZE / sizeof((_radix)->type[0])) #define __genradix_page_remainder(_radix) \ - (PAGE_SIZE % sizeof((_radix)->type[0])) + (GENRADIX_NODE_SIZE % sizeof((_radix)->type[0])) #define __genradix_idx_to_offset(_radix, _idx) \ __idx_to_offset(_idx, __genradix_obj_size(_radix)) @@ -217,8 +220,8 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter, iter->offset += obj_size; if (!is_power_of_2(obj_size) && - (iter->offset & (PAGE_SIZE - 1)) + obj_size > PAGE_SIZE) - iter->offset = round_up(iter->offset, PAGE_SIZE); + (iter->offset & (GENRADIX_NODE_SIZE - 1)) + obj_size > GENRADIX_NODE_SIZE) + iter->offset = round_up(iter->offset, GENRADIX_NODE_SIZE); iter->pos++; } @@ -235,8 +238,8 @@ static inline void __genradix_iter_rewind(struct genradix_iter *iter, return; } - if ((iter->offset & (PAGE_SIZE - 1)) == 0) - iter->offset -= PAGE_SIZE % obj_size; + if ((iter->offset & (GENRADIX_NODE_SIZE - 1)) == 0) + iter->offset -= GENRADIX_NODE_SIZE % obj_size; iter->offset -= obj_size; iter->pos--; @@ -263,7 +266,7 @@ static inline void __genradix_iter_rewind(struct genradix_iter *iter, genradix_for_each_from(_radix, _iter, _p, 0) #define genradix_last_pos(_radix) \ - (SIZE_MAX / PAGE_SIZE * __genradix_objs_per_page(_radix) - 1) + (SIZE_MAX / GENRADIX_NODE_SIZE * __genradix_objs_per_page(_radix) - 1) /** * genradix_for_each_reverse - iterate over entry in a genradix, reverse order diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c index 41f1bcdc4488..aaefb9b678c8 100644 --- a/lib/generic-radix-tree.c +++ b/lib/generic-radix-tree.c @@ -5,7 +5,7 @@ #include #include -#define GENRADIX_ARY (PAGE_SIZE / sizeof(struct genradix_node *)) +#define GENRADIX_ARY (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *)) #define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY) struct genradix_node { @@ -14,13 +14,13 @@ struct genradix_node { struct genradix_node *children[GENRADIX_ARY]; /* Leaf: */ - u8 data[PAGE_SIZE]; + u8 data[GENRADIX_NODE_SIZE]; }; }; static inline int genradix_depth_shift(unsigned depth) { - return PAGE_SHIFT + GENRADIX_ARY_SHIFT * depth; + return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth; } /* @@ -33,7 +33,7 @@ static inline size_t genradix_depth_size(unsigned depth) /* depth that's needed for a genradix that can address up to ULONG_MAX: */ #define GENRADIX_MAX_DEPTH \ - DIV_ROUND_UP(BITS_PER_LONG - PAGE_SHIFT, GENRADIX_ARY_SHIFT) + DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT) #define GENRADIX_DEPTH_MASK \ ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1)) @@ -79,23 +79,12 @@ EXPORT_SYMBOL(__genradix_ptr); static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask) { - struct genradix_node *node; - - node = (struct genradix_node *)__get_free_page(gfp_mask|__GFP_ZERO); - - /* - * We're using pages (not slab allocations) directly for kernel data - * structures, so we need to explicitly inform kmemleak of them in order - * to avoid false positive memory leak reports. - */ - kmemleak_alloc(node, PAGE_SIZE, 1, gfp_mask); - return node; + return kzalloc(GENRADIX_NODE_SIZE, gfp_mask); } static inline void genradix_free_node(struct genradix_node *node) { - kmemleak_free(node); - free_page((unsigned long)node); + kfree(node); } /* @@ -200,7 +189,7 @@ restart: i++; iter->offset = round_down(iter->offset + objs_per_ptr, objs_per_ptr); - iter->pos = (iter->offset >> PAGE_SHIFT) * + iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page; if (i == GENRADIX_ARY) goto restart; @@ -209,7 +198,7 @@ restart: n = n->children[i]; } - return &n->data[iter->offset & (PAGE_SIZE - 1)]; + return &n->data[iter->offset & (GENRADIX_NODE_SIZE - 1)]; } EXPORT_SYMBOL(__genradix_iter_peek); @@ -235,7 +224,7 @@ restart: if (ilog2(iter->offset) >= genradix_depth_shift(level)) { iter->offset = genradix_depth_size(level); - iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; + iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page; iter->offset -= obj_size_plus_page_remainder; iter->pos--; @@ -251,7 +240,7 @@ restart: size_t objs_per_ptr = genradix_depth_size(level); iter->offset = round_down(iter->offset, objs_per_ptr); - iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; + iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page; if (!iter->offset) return NULL; @@ -267,7 +256,7 @@ restart: n = n->children[i]; } - return &n->data[iter->offset & (PAGE_SIZE - 1)]; + return &n->data[iter->offset & (GENRADIX_NODE_SIZE - 1)]; } EXPORT_SYMBOL(__genradix_iter_peek_prev); @@ -289,7 +278,7 @@ int __genradix_prealloc(struct __genradix *radix, size_t size, { size_t offset; - for (offset = 0; offset < size; offset += PAGE_SIZE) + for (offset = 0; offset < size; offset += GENRADIX_NODE_SIZE) if (!__genradix_ptr_alloc(radix, offset, gfp_mask)) return -ENOMEM; -- cgit v1.2.3 From 5e105fb806c622cd8b3da0031e74367a3b8a326b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 8 Mar 2024 13:51:38 -0500 Subject: bcachefs: fix bch2_journal_buf_to_text() Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 9f208fcb9575..59d6e262277f 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -84,12 +84,8 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 prt_str(out, "separate_flush "); if (buf->need_flush_to_write_buffer) prt_str(out, "need_flush_to_write_buffer "); - if (buf->need_flush_to_write_buffer) - prt_str(out, "need_flush_to_write_buffer "); - if (buf->write_done) - prt_str(out, "write done "); if (buf->write_started) - prt_str(out, "write started "); + prt_str(out, "write_started "); if (buf->write_allocated) prt_str(out, "write allocated "); if (buf->write_done) -- cgit v1.2.3 From c42006458b4239b7eb50f33af523b67c7a55e42e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 8 Mar 2024 16:10:08 -0500 Subject: bcachefs: Check for writing superblocks with nonsense member seq fields We're seeing some unmountable filesystems due to split brain detection going awry; it seems we somehow wrote out superblocks where we updated the superblock seq without updating any member seq fields. A given device's superblock should always have the main seq equal to it's member seq field, so this is easy to check for. Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 38a5073202c5..010daebf987b 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -470,6 +470,14 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, return ret; } + if (rw == WRITE && + bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) { + prt_printf(out, "Invalid superblock: member seq %llu != sb seq %llu", + le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq), + le64_to_cpu(sb->seq)); + return -BCH_ERR_invalid_sb_members_missing; + } + return 0; } -- cgit v1.2.3 From 1fdb9685ed8b7b871632a3d348a948e1b53a17e5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 8 Mar 2024 19:00:25 -0500 Subject: bcachefs: Kill unused flags argument to btree_split() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index fd0bbce11765..642213ef9f79 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -25,8 +25,7 @@ #include static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, - btree_path_idx_t, struct btree *, - struct keylist *, unsigned); + btree_path_idx_t, struct btree *, struct keylist *); static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans, @@ -1473,7 +1472,7 @@ static void btree_split_insert_keys(struct btree_update *as, static int btree_split(struct btree_update *as, struct btree_trans *trans, btree_path_idx_t path, struct btree *b, - struct keylist *keys, unsigned flags) + struct keylist *keys) { struct bch_fs *c = as->c; struct btree *parent = btree_node_parent(trans->paths + path, b); @@ -1574,7 +1573,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (parent) { /* Split a non root node */ - ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); + ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys); if (ret) goto err; } else if (n3) { @@ -1669,7 +1668,6 @@ bch2_btree_insert_keys_interior(struct btree_update *as, * @path_idx: path that points to current node * @b: node to insert keys into * @keys: list of keys to insert - * @flags: transaction commit flags * * Returns: 0 on success, typically transaction restart error on failure * @@ -1679,7 +1677,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, */ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, btree_path_idx_t path_idx, struct btree *b, - struct keylist *keys, unsigned flags) + struct keylist *keys) { struct bch_fs *c = as->c; struct btree_path *path = trans->paths + path_idx; @@ -1735,7 +1733,7 @@ split: return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); } - return btree_split(as, trans, path_idx, b, keys, flags); + return btree_split(as, trans, path_idx, b, keys); } int bch2_btree_split_leaf(struct btree_trans *trans, @@ -1754,7 +1752,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans, if (IS_ERR(as)) return PTR_ERR(as); - ret = btree_split(as, trans, path, b, NULL, flags); + ret = btree_split(as, trans, path, b, NULL); if (ret) { bch2_btree_update_free(as, trans); return ret; @@ -1964,7 +1962,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_trans_verify_paths(trans); - ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); + ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys); if (ret) goto err_free_update; @@ -2035,8 +2033,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); - ret = bch2_btree_insert_node(as, trans, iter->path, - parent, &as->parent_keys, flags); + ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys); if (ret) goto err; } else { -- cgit v1.2.3 From 3e48999816b1d1dba3ca40b1d7dbc324adb72fe2 Mon Sep 17 00:00:00 2001 From: Erick Archer Date: Sun, 10 Mar 2024 12:02:26 +0100 Subject: bcachefs: Prefer struct_size over open coded arithmetic This is an effort to get rid of all multiplications from allocation functions in order to prevent integer overflows [1][2]. As the "op" variable is a pointer to "struct promote_op" and this structure ends in a flexible array: struct promote_op { [...] struct bio_vec bi_inline_vecs[]; }; and the "t" variable is a pointer to "struct journal_seq_blacklist_table" and this structure also ends in a flexible array: struct journal_seq_blacklist_table { [...] struct journal_seq_blacklist_table_entry { u64 start; u64 end; bool dirty; } entries[]; }; the preferred way in the kernel is to use the struct_size() helper to do the arithmetic instead of the argument "size + size * count" in the kzalloc() functions. This way, the code is more readable and safer. Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#open-coded-arithmetic-in-allocator-arguments [1] Link: https://github.com/KSPP/linux/issues/160 [2] Signed-off-by: Erick Archer Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 2 +- fs/bcachefs/journal_seq_blacklist.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 3c574d8873a1..8a556e6d1ab6 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -174,7 +174,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) return ERR_PTR(-BCH_ERR_nopromote_no_writes); - op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_KERNEL); + op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL); if (!op) { ret = -BCH_ERR_nopromote_enomem; goto err; diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 5acc1276675c..b5303874fc35 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -141,8 +141,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) if (!bl) return 0; - t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr, - GFP_KERNEL); + t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL); if (!t) return -BCH_ERR_ENOMEM_blacklist_table_init; -- cgit v1.2.3 From 06ebc48306acc36ecb4d2eeb41fc719de4aaf442 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Mar 2024 16:24:16 -0400 Subject: bcachefs: fix deletion of indirect extents in btree_gc we need to run the normal extent update path on deletion - bch2_bkey_make_mut() is incorrect when key type is changing. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index b7085e996c44..0e1ef8bef997 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1586,8 +1586,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans, " should be %u", (bch2_bkey_val_to_text(&buf, c, k), buf.buf), r->refcount)) { - struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0); - + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); ret = PTR_ERR_OR_ZERO(new); if (ret) return ret; @@ -1596,6 +1595,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans, new->k.type = KEY_TYPE_deleted; else *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount); + ret = bch2_trans_update(trans, iter, new, 0); } fsck_err: printbuf_exit(&buf); -- cgit v1.2.3 From b6fc661f098631f4a4db43f47ce8d678350fc9ca Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Mar 2024 16:29:06 -0400 Subject: bcachefs: Fix order of gc_done passes gc_stripes_done() and gc_reflink_done() may do alloc btree updates (i.e. when deleting an indirect extent) - we need bucket gens to be fixed by then. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 0e1ef8bef997..550d71bcedc4 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1818,10 +1818,10 @@ out: if (!ret) { bch2_journal_block(&c->journal); - ret = bch2_gc_stripes_done(c, metadata_only) ?: - bch2_gc_reflink_done(c, metadata_only) ?: - bch2_gc_alloc_done(c, metadata_only) ?: - bch2_gc_done(c, initial, metadata_only); + ret = bch2_gc_alloc_done(c, metadata_only) ?: + bch2_gc_done(c, initial, metadata_only) ?: + bch2_gc_stripes_done(c, metadata_only) ?: + bch2_gc_reflink_done(c, metadata_only); bch2_journal_unblock(&c->journal); } -- cgit v1.2.3 From 5d04409a62c322494cca0d0d8fef8b7f2d3bcc3f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Mar 2024 23:00:23 -0400 Subject: bcachefs: Always flush write buffer in delete_dead_inodes() Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index a3139bb66f77..2b5e06770ab3 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1181,6 +1181,15 @@ int bch2_delete_dead_inodes(struct bch_fs *c) bool need_another_pass; int ret; again: + /* + * if we ran check_inodes() unlinked inodes will have already been + * cleaned up but the write buffer will be out of sync; therefore we + * alway need a write buffer flush + */ + ret = bch2_btree_write_buffer_flush_sync(trans); + if (ret) + goto err; + need_another_pass = false; /* @@ -1213,12 +1222,8 @@ again: ret; })); - if (!ret && need_another_pass) { - ret = bch2_btree_write_buffer_flush_sync(trans); - if (ret) - goto err; + if (!ret && need_another_pass) goto again; - } err: bch2_trans_put(trans); return ret; -- cgit v1.2.3 From b3f8e711171909fe3179c806445a62b573a9711e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Mar 2024 20:53:17 -0400 Subject: bcachefs: Fix btree key cache coherency during replay Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 8 +++++--- fs/bcachefs/btree_update.c | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 74e52fd28abe..8a71d43444b9 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -380,9 +380,11 @@ static int btree_key_cache_fill(struct btree_trans *trans, struct bkey_i *new_k = NULL; int ret; - k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos, - BTREE_ITER_KEY_CACHE_FILL| - BTREE_ITER_CACHED_NOFILL); + bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos, + BTREE_ITER_KEY_CACHE_FILL| + BTREE_ITER_CACHED_NOFILL); + iter.flags &= ~BTREE_ITER_WITH_JOURNAL; + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 45994fbac645..a4b40c1656a5 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -452,7 +452,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, * the key cache - but the key has to exist in the btree for that to * work: */ - if (path->cached && bkey_deleted(&i->old_k)) + if (path->cached && !i->old_btree_u64s) return flush_new_cached_update(trans, i, flags, ip); return 0; -- cgit v1.2.3 From 3bbed372141705f820c5942bbe4e50c6126cdc9a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Mar 2024 23:34:19 -0400 Subject: bcachefs: fix bch_folio_sector padding Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-pagecache.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h index 8cbaba6565b4..828c3d7c8f19 100644 --- a/fs/bcachefs/fs-io-pagecache.h +++ b/fs/bcachefs/fs-io-pagecache.h @@ -51,13 +51,10 @@ enum bch_folio_sector_state { struct bch_folio_sector { /* Uncompressed, fully allocated replicas (or on disk reservation): */ - unsigned nr_replicas:4; - + u8 nr_replicas:4, /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ - unsigned replicas_reserved:4; - - /* i_sectors: */ - enum bch_folio_sector_state state:8; + replicas_reserved:4; + u8 state; }; struct bch_folio { -- cgit v1.2.3 From cdce109431f30108665a7de72b542af7f02cf966 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Mar 2024 21:15:26 -0400 Subject: bcachefs: reconstruct_alloc cleanup Now that we've got the errors_silent mechanism, we don't have to check if the reconstruct_alloc option is set all over the place. Also - users no longer have to explicitly select fsck and fix_errors. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 62 +++++++++++++++----------------- fs/bcachefs/backpointers.c | 3 +- fs/bcachefs/btree_gc.c | 82 ++++++++++++++++++++---------------------- fs/bcachefs/lru.c | 3 +- fs/bcachefs/recovery.c | 51 +++++++++++++++++++------- fs/bcachefs/sb-downgrade.c | 2 +- fs/bcachefs/util.h | 5 +++ 7 files changed, 113 insertions(+), 95 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index ccd6cbfd470e..c47f72f2bd58 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1052,14 +1052,13 @@ int bch2_check_alloc_key(struct btree_trans *trans, if (ret) goto err; - if (k.k->type != discard_key_type && - (c->opts.reconstruct_alloc || - fsck_err(c, need_discard_key_wrong, - "incorrect key in need_discard btree (got %s should be %s)\n" - " %s", - bch2_bkey_types[k.k->type], - bch2_bkey_types[discard_key_type], - (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + if (fsck_err_on(k.k->type != discard_key_type, + c, need_discard_key_wrong, + "incorrect key in need_discard btree (got %s should be %s)\n" + " %s", + bch2_bkey_types[k.k->type], + bch2_bkey_types[discard_key_type], + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(*update)); @@ -1083,15 +1082,14 @@ int bch2_check_alloc_key(struct btree_trans *trans, if (ret) goto err; - if (k.k->type != freespace_key_type && - (c->opts.reconstruct_alloc || - fsck_err(c, freespace_key_wrong, - "incorrect key in freespace btree (got %s should be %s)\n" - " %s", - bch2_bkey_types[k.k->type], - bch2_bkey_types[freespace_key_type], - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + if (fsck_err_on(k.k->type != freespace_key_type, + c, freespace_key_wrong, + "incorrect key in freespace btree (got %s should be %s)\n" + " %s", + bch2_bkey_types[k.k->type], + bch2_bkey_types[freespace_key_type], + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(*update)); @@ -1115,14 +1113,13 @@ int bch2_check_alloc_key(struct btree_trans *trans, if (ret) goto err; - if (a->gen != alloc_gen(k, gens_offset) && - (c->opts.reconstruct_alloc || - fsck_err(c, bucket_gens_key_wrong, - "incorrect gen in bucket_gens btree (got %u should be %u)\n" - " %s", - alloc_gen(k, gens_offset), a->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + if (fsck_err_on(a->gen != alloc_gen(k, gens_offset), + c, bucket_gens_key_wrong, + "incorrect gen in bucket_gens btree (got %u should be %u)\n" + " %s", + alloc_gen(k, gens_offset), a->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { struct bkey_i_bucket_gens *g = bch2_trans_kmalloc(trans, sizeof(*g)); @@ -1174,14 +1171,13 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans, *end = bkey_min(k.k->p, *end); - if (k.k->type != KEY_TYPE_set && - (c->opts.reconstruct_alloc || - fsck_err(c, freespace_hole_missing, - "hole in alloc btree missing in freespace btree\n" - " device %llu buckets %llu-%llu", - freespace_iter->pos.inode, - freespace_iter->pos.offset, - end->offset))) { + if (fsck_err_on(k.k->type != KEY_TYPE_set, + c, freespace_hole_missing, + "hole in alloc btree missing in freespace btree\n" + " device %llu buckets %llu-%llu", + freespace_iter->pos.inode, + freespace_iter->pos.offset, + end->offset)) { struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(*update)); diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index f2b33fe40bd8..8cb35ea572cb 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -477,8 +477,7 @@ missing: prt_printf(&buf, "\nbp pos "); bch2_bpos_to_text(&buf, bp_iter.pos); - if (c->opts.reconstruct_alloc || - fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) + if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); goto out; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 550d71bcedc4..584aee7010de 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -593,16 +593,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr); - if (!g->gen_valid && - (c->opts.reconstruct_alloc || - fsck_err(c, ptr_to_missing_alloc_key, - "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { + if (fsck_err_on(!g->gen_valid, + c, ptr_to_missing_alloc_key, + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (!p.ptr.cached) { g->gen_valid = true; g->gen = p.ptr.gen; @@ -611,16 +610,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id } } - if (gen_cmp(p.ptr.gen, g->gen) > 0 && - (c->opts.reconstruct_alloc || - fsck_err(c, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { + if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, + c, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (!p.ptr.cached) { g->gen_valid = true; g->gen = p.ptr.gen; @@ -633,28 +631,26 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id } } - if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX && - (c->opts.reconstruct_alloc || - fsck_err(c, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) + if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, + c, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) do_update = true; - if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 && - (c->opts.reconstruct_alloc || - fsck_err(c, stale_dirty_ptr, - "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) + if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0, + c, stale_dirty_ptr, + "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) do_update = true; if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) @@ -1411,8 +1407,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, if (gen_after(old->gen, gc.gen)) return 0; - if (c->opts.reconstruct_alloc || - fsck_err_on(new.data_type != gc.data_type, c, + if (fsck_err_on(new.data_type != gc.data_type, c, alloc_key_data_type_wrong, "bucket %llu:%llu gen %u has wrong data_type" ": got %s, should be %s", @@ -1423,8 +1418,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, new.data_type = gc.data_type; #define copy_bucket_field(_errtype, _f) \ - if (c->opts.reconstruct_alloc || \ - fsck_err_on(new._f != gc._f, c, _errtype, \ + if (fsck_err_on(new._f != gc._f, c, _errtype, \ "bucket %llu:%llu gen %u data type %s has wrong " #_f \ ": got %u, should be %u", \ iter->pos.inode, iter->pos.offset, \ diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index ed7577cdb212..26569043e368 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -125,8 +125,7 @@ static int bch2_check_lru_key(struct btree_trans *trans, goto out; } - if (c->opts.reconstruct_alloc || - fsck_err(c, lru_entry_bad, + if (fsck_err(c, lru_entry_bad, "incorrect lru entry: lru %s time %llu\n" " %s\n" " for %s", diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 0c579ba8de78..2af219aedfdb 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -52,14 +52,47 @@ static bool btree_id_is_alloc(enum btree_id id) } /* for -o reconstruct_alloc: */ -static void drop_alloc_keys(struct journal_keys *keys) +static void do_reconstruct_alloc(struct bch_fs *c) { + bch2_journal_log_msg(c, "dropping alloc info"); + bch_info(c, "dropping and reconstructing all alloc info"); + + mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required); + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required); + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required); + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required); + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required); + + __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent); + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + + struct journal_keys *keys = &c->journal_keys; size_t src, dst; for (src = 0, dst = 0; src < keys->nr; src++) if (!btree_id_is_alloc(keys->data[src].btree_id)) keys->data[dst++] = keys->data[src]; - keys->nr = dst; } @@ -395,11 +428,8 @@ static int read_btree_roots(struct bch_fs *c) if (!r->alive) continue; - if (btree_id_is_alloc(i) && - c->opts.reconstruct_alloc) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc) continue; - } if (r->error) { __fsck_err(c, @@ -930,10 +960,8 @@ use_clean: c->journal_replay_seq_start = last_seq; c->journal_replay_seq_end = blacklist_seq - 1; - if (c->opts.reconstruct_alloc) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); - drop_alloc_keys(&c->journal_keys); - } + if (c->opts.reconstruct_alloc) + do_reconstruct_alloc(c); zero_out_btree_mem_ptr(&c->journal_keys); @@ -968,9 +996,6 @@ use_clean: if (ret) goto err; - if (c->opts.reconstruct_alloc) - bch2_journal_log_msg(c, "dropping alloc info"); - /* * Skip past versions that might have possibly been used (as nonces), * but hadn't had their pointers written: diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 49bc4951572c..e4396cb0bacb 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -259,7 +259,7 @@ void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_mi if (e < BCH_SB_ERR_MAX) __set_bit(e, c->sb.errors_silent); if (e < sizeof(ext->errors_silent) * 8) - ext->errors_silent[e / 64] |= cpu_to_le64(BIT_ULL(e % 64)); + __set_bit_le64(e, ext->errors_silent); } } } diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index adff17f6f524..4aba415ab5b8 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -864,4 +864,9 @@ static inline int copy_from_user_errcode(void *to, const void __user *from, unsi #endif +static inline void __set_bit_le64(size_t bit, __le64 *addr) +{ + addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64)); +} + #endif /* _BCACHEFS_UTIL_H */ -- cgit v1.2.3 From f1ca1abfb0275db241363743ed3606b25b2b1a5c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 13 Mar 2024 20:16:40 -0400 Subject: bcachefs: pull out time_stats.[ch] prep work for lifting out of fs/bcachefs/ Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/alloc_foreground.c | 13 +-- fs/bcachefs/bcachefs.h | 3 +- fs/bcachefs/journal.c | 3 +- fs/bcachefs/journal_io.c | 3 +- fs/bcachefs/journal_reclaim.c | 9 +- fs/bcachefs/journal_types.h | 5 -- fs/bcachefs/time_stats.c | 163 ++++++++++++++++++++++++++++++++++ fs/bcachefs/time_stats.h | 132 +++++++++++++++++++++++++++ fs/bcachefs/util.c | 196 ++++------------------------------------- fs/bcachefs/util.h | 77 +--------------- 11 files changed, 326 insertions(+), 279 deletions(-) create mode 100644 fs/bcachefs/time_stats.c create mode 100644 fs/bcachefs/time_stats.h diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 3433959d4f35..b02796c8a595 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -82,6 +82,7 @@ bcachefs-y := \ super-io.o \ sysfs.o \ tests.o \ + time_stats.o \ thread_with_file.o \ trace.o \ two_state_shared_lock.o \ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 633d3223b353..ca58193dd902 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -236,8 +236,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * if (cl) closure_wait(&c->open_buckets_wait, cl); - track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], - &c->blocked_allocate_open_bucket, true); + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true); spin_unlock(&c->freelist_lock); return ERR_PTR(-BCH_ERR_open_buckets_empty); } @@ -263,11 +262,8 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * ca->nr_open_buckets++; bch2_open_bucket_hash_add(c, ob); - track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], - &c->blocked_allocate_open_bucket, false); - - track_event_change(&c->times[BCH_TIME_blocked_allocate], - &c->blocked_allocate, false); + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false); + track_event_change(&c->times[BCH_TIME_blocked_allocate], false); spin_unlock(&c->freelist_lock); return ob; @@ -555,8 +551,7 @@ again: goto again; } - track_event_change(&c->times[BCH_TIME_blocked_allocate], - &c->blocked_allocate, true); + track_event_change(&c->times[BCH_TIME_blocked_allocate], true); ob = ERR_PTR(-BCH_ERR_freelist_empty); goto err; diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index af7c798945f1..17a5a7151901 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -212,6 +212,7 @@ #include "recovery_types.h" #include "sb-errors_types.h" #include "seqmutex.h" +#include "time_stats.h" #include "util.h" #ifdef CONFIG_BCACHEFS_DEBUG @@ -924,8 +925,6 @@ struct bch_fs { /* ALLOCATOR */ spinlock_t freelist_lock; struct closure_waitlist freelist_wait; - u64 blocked_allocate; - u64 blocked_allocate_open_bucket; open_bucket_idx_t open_buckets_freelist; open_bucket_idx_t open_buckets_nr_free; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 59d6e262277f..f314b2e78ec3 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -566,8 +566,7 @@ out: ret = -BCH_ERR_journal_res_get_blocked; if (ret == JOURNAL_ERR_max_in_flight && - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], - &j->max_in_flight_start, true)) { + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) { struct printbuf buf = PRINTBUF; prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 8ded059591c7..d76c3c0c203f 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1680,8 +1680,7 @@ static CLOSURE_CALLBACK(journal_write_done) bch2_journal_reclaim_fast(j); bch2_journal_space_available(j); - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], - &j->max_in_flight_start, false); + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false); journal_wake(j); } diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index b975b5727471..ab811c0dad26 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -62,12 +62,9 @@ void bch2_journal_set_watermark(struct journal *j) ? BCH_WATERMARK_reclaim : BCH_WATERMARK_stripe; - if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], - &j->low_on_space_start, low_on_space) || - track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], - &j->low_on_pin_start, low_on_pin) || - track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], - &j->write_buffer_full_start, low_on_wb)) + if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) || + track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) || + track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb)) trace_and_count(c, journal_full, c); swap(watermark, j->watermark); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 3696aac3ccb7..8c053cb64ca5 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -287,11 +287,6 @@ struct journal { u64 nr_noflush_writes; u64 entry_bytes_written; - u64 low_on_space_start; - u64 low_on_pin_start; - u64 max_in_flight_start; - u64 write_buffer_full_start; - struct bch2_time_stats *flush_write_time; struct bch2_time_stats *noflush_write_time; struct bch2_time_stats *flush_seq_time; diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c new file mode 100644 index 000000000000..d2f7ddbf6c31 --- /dev/null +++ b/fs/bcachefs/time_stats.c @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include + +#include "eytzinger.h" +#include "time_stats.h" + +static const struct time_unit time_units[] = { + { "ns", 1 }, + { "us", NSEC_PER_USEC }, + { "ms", NSEC_PER_MSEC }, + { "s", NSEC_PER_SEC }, + { "m", (u64) NSEC_PER_SEC * 60}, + { "h", (u64) NSEC_PER_SEC * 3600}, + { "eon", U64_MAX }, +}; + +const struct time_unit *bch2_pick_time_units(u64 ns) +{ + const struct time_unit *u; + + for (u = time_units; + u + 1 < time_units + ARRAY_SIZE(time_units) && + ns >= u[1].nsecs << 1; + u++) + ; + + return u; +} + +static void quantiles_update(struct quantiles *q, u64 v) +{ + unsigned i = 0; + + while (i < ARRAY_SIZE(q->entries)) { + struct quantile_entry *e = q->entries + i; + + if (unlikely(!e->step)) { + e->m = v; + e->step = max_t(unsigned, v / 2, 1024); + } else if (e->m > v) { + e->m = e->m >= e->step + ? e->m - e->step + : 0; + } else if (e->m < v) { + e->m = e->m + e->step > e->m + ? e->m + e->step + : U32_MAX; + } + + if ((e->m > v ? e->m - v : v - e->m) < e->step) + e->step = max_t(unsigned, e->step / 2, 1); + + if (v >= e->m) + break; + + i = eytzinger0_child(i, v > e->m); + } +} + +static inline void time_stats_update_one(struct bch2_time_stats *stats, + u64 start, u64 end) +{ + u64 duration, freq; + + if (time_after64(end, start)) { + duration = end - start; + mean_and_variance_update(&stats->duration_stats, duration); + mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration); + stats->max_duration = max(stats->max_duration, duration); + stats->min_duration = min(stats->min_duration, duration); + stats->total_duration += duration; + + if (stats->quantiles_enabled) + quantiles_update(&stats->quantiles, duration); + } + + if (stats->last_event && time_after64(end, stats->last_event)) { + freq = end - stats->last_event; + mean_and_variance_update(&stats->freq_stats, freq); + mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq); + stats->max_freq = max(stats->max_freq, freq); + stats->min_freq = min(stats->min_freq, freq); + } + + stats->last_event = end; +} + +void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, + struct time_stat_buffer *b) +{ + for (struct time_stat_buffer_entry *i = b->entries; + i < b->entries + ARRAY_SIZE(b->entries); + i++) + time_stats_update_one(stats, i->start, i->end); + b->nr = 0; +} + +static noinline void time_stats_clear_buffer(struct bch2_time_stats *stats, + struct time_stat_buffer *b) +{ + unsigned long flags; + + spin_lock_irqsave(&stats->lock, flags); + __bch2_time_stats_clear_buffer(stats, b); + spin_unlock_irqrestore(&stats->lock, flags); +} + +void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) +{ + unsigned long flags; + + WARN_ONCE(!stats->duration_stats_weighted.weight || + !stats->freq_stats_weighted.weight, + "uninitialized bch2_time_stats"); + + if (!stats->buffer) { + spin_lock_irqsave(&stats->lock, flags); + time_stats_update_one(stats, start, end); + + if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 && + stats->duration_stats.n > 1024) + stats->buffer = + alloc_percpu_gfp(struct time_stat_buffer, + GFP_ATOMIC); + spin_unlock_irqrestore(&stats->lock, flags); + } else { + struct time_stat_buffer *b; + + preempt_disable(); + b = this_cpu_ptr(stats->buffer); + + BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); + b->entries[b->nr++] = (struct time_stat_buffer_entry) { + .start = start, + .end = end + }; + + if (unlikely(b->nr == ARRAY_SIZE(b->entries))) + time_stats_clear_buffer(stats, b); + preempt_enable(); + } +} + +void bch2_time_stats_exit(struct bch2_time_stats *stats) +{ + free_percpu(stats->buffer); +} + +void bch2_time_stats_init(struct bch2_time_stats *stats) +{ + memset(stats, 0, sizeof(*stats)); + stats->duration_stats_weighted.weight = 8; + stats->freq_stats_weighted.weight = 8; + stats->min_duration = U64_MAX; + stats->min_freq = U64_MAX; + spin_lock_init(&stats->lock); +} diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h new file mode 100644 index 000000000000..d0291ea863ba --- /dev/null +++ b/fs/bcachefs/time_stats.h @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * bch2_time_stats - collect statistics on events that have a duration, with nicely + * formatted textual output on demand + * + * - percpu buffering of event collection: cheap enough to shotgun + * everywhere without worrying about overhead + * + * tracks: + * - number of events + * - maximum event duration ever seen + * - sum of all event durations + * - average event duration, standard and weighted + * - standard deviation of event durations, standard and weighted + * and analagous statistics for the frequency of events + * + * We provide both mean and weighted mean (exponentially weighted), and standard + * deviation and weighted standard deviation, to give an efficient-to-compute + * view of current behaviour versus. average behaviour - "did this event source + * just become wonky, or is this typical?". + * + * Particularly useful for tracking down latency issues. + */ +#ifndef _BCACHEFS_TIME_STATS_H +#define _BCACHEFS_TIME_STATS_H + +#include +#include + +#include "mean_and_variance.h" + +struct time_unit { + const char *name; + u64 nsecs; +}; + +/* + * given a nanosecond value, pick the preferred time units for printing: + */ +const struct time_unit *bch2_pick_time_units(u64 ns); + +/* + * quantiles - do not use: + * + * Only enabled if bch2_time_stats->quantiles_enabled has been manually set - don't + * use in new code. + */ + +#define NR_QUANTILES 15 +#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) +#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) +#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) + +struct quantiles { + struct quantile_entry { + u64 m; + u64 step; + } entries[NR_QUANTILES]; +}; + +struct time_stat_buffer { + unsigned nr; + struct time_stat_buffer_entry { + u64 start; + u64 end; + } entries[32]; +}; + +struct bch2_time_stats { + spinlock_t lock; + bool quantiles_enabled; + /* all fields are in nanoseconds */ + u64 min_duration; + u64 max_duration; + u64 total_duration; + u64 max_freq; + u64 min_freq; + u64 last_event; + u64 last_event_start; + struct quantiles quantiles; + + struct mean_and_variance duration_stats; + struct mean_and_variance_weighted duration_stats_weighted; + struct mean_and_variance freq_stats; + struct mean_and_variance_weighted freq_stats_weighted; + struct time_stat_buffer __percpu *buffer; +}; + +void __bch2_time_stats_clear_buffer(struct bch2_time_stats *, struct time_stat_buffer *); +void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); + +/** + * time_stats_update - collect a new event being tracked + * + * @stats - bch2_time_stats to update + * @start - start time of event, recorded with local_clock() + * + * The end duration of the event will be the current time + */ +static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) +{ + __bch2_time_stats_update(stats, start, local_clock()); +} + +/** + * track_event_change - track state change events + * + * @stats - bch2_time_stats to update + * @v - new state, true or false + * + * Use this when tracking time stats for state changes, i.e. resource X becoming + * blocked/unblocked. + */ +static inline bool track_event_change(struct bch2_time_stats *stats, bool v) +{ + if (v != !!stats->last_event_start) { + if (!v) { + bch2_time_stats_update(stats, stats->last_event_start); + stats->last_event_start = 0; + } else { + stats->last_event_start = local_clock() ?: 1; + return true; + } + } + + return false; +} + +void bch2_time_stats_exit(struct bch2_time_stats *); +void bch2_time_stats_init(struct bch2_time_stats *); + +#endif /* _BCACHEFS_TIME_STATS_H */ diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index bd3687a726f7..96de039fc890 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -337,157 +337,16 @@ void bch2_prt_datetime(struct printbuf *out, time64_t sec) } #endif -static const struct time_unit { - const char *name; - u64 nsecs; -} time_units[] = { - { "ns", 1 }, - { "us", NSEC_PER_USEC }, - { "ms", NSEC_PER_MSEC }, - { "s", NSEC_PER_SEC }, - { "m", (u64) NSEC_PER_SEC * 60}, - { "h", (u64) NSEC_PER_SEC * 3600}, - { "eon", U64_MAX }, -}; - -static const struct time_unit *pick_time_units(u64 ns) -{ - const struct time_unit *u; - - for (u = time_units; - u + 1 < time_units + ARRAY_SIZE(time_units) && - ns >= u[1].nsecs << 1; - u++) - ; - - return u; -} - void bch2_pr_time_units(struct printbuf *out, u64 ns) { - const struct time_unit *u = pick_time_units(ns); + const struct time_unit *u = bch2_pick_time_units(ns); prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); } -/* time stats: */ - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v) -{ - unsigned i = 0; - - while (i < ARRAY_SIZE(q->entries)) { - struct bch2_quantile_entry *e = q->entries + i; - - if (unlikely(!e->step)) { - e->m = v; - e->step = max_t(unsigned, v / 2, 1024); - } else if (e->m > v) { - e->m = e->m >= e->step - ? e->m - e->step - : 0; - } else if (e->m < v) { - e->m = e->m + e->step > e->m - ? e->m + e->step - : U32_MAX; - } - - if ((e->m > v ? e->m - v : v - e->m) < e->step) - e->step = max_t(unsigned, e->step / 2, 1); - - if (v >= e->m) - break; - - i = eytzinger0_child(i, v > e->m); - } -} - -static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, - u64 start, u64 end) -{ - u64 duration, freq; - - if (time_after64(end, start)) { - duration = end - start; - mean_and_variance_update(&stats->duration_stats, duration); - mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration); - stats->max_duration = max(stats->max_duration, duration); - stats->min_duration = min(stats->min_duration, duration); - stats->total_duration += duration; - bch2_quantiles_update(&stats->quantiles, duration); - } - - if (stats->last_event && time_after64(end, stats->last_event)) { - freq = end - stats->last_event; - mean_and_variance_update(&stats->freq_stats, freq); - mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq); - stats->max_freq = max(stats->max_freq, freq); - stats->min_freq = min(stats->min_freq, freq); - } - - stats->last_event = end; -} - -static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, - struct bch2_time_stat_buffer *b) -{ - for (struct bch2_time_stat_buffer_entry *i = b->entries; - i < b->entries + ARRAY_SIZE(b->entries); - i++) - bch2_time_stats_update_one(stats, i->start, i->end); - b->nr = 0; -} - -static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, - struct bch2_time_stat_buffer *b) -{ - unsigned long flags; - - spin_lock_irqsave(&stats->lock, flags); - __bch2_time_stats_clear_buffer(stats, b); - spin_unlock_irqrestore(&stats->lock, flags); -} - -void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) -{ - unsigned long flags; - - WARN_ONCE(!stats->duration_stats_weighted.weight || - !stats->freq_stats_weighted.weight, - "uninitialized time_stats"); - - if (!stats->buffer) { - spin_lock_irqsave(&stats->lock, flags); - bch2_time_stats_update_one(stats, start, end); - - if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 && - stats->duration_stats.n > 1024) - stats->buffer = - alloc_percpu_gfp(struct bch2_time_stat_buffer, - GFP_ATOMIC); - spin_unlock_irqrestore(&stats->lock, flags); - } else { - struct bch2_time_stat_buffer *b; - - preempt_disable(); - b = this_cpu_ptr(stats->buffer); - - BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); - b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) { - .start = start, - .end = end - }; - - if (unlikely(b->nr == ARRAY_SIZE(b->entries))) - bch2_time_stats_clear_buffer(stats, b); - preempt_enable(); - } -} - static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) { - const struct time_unit *u = pick_time_units(ns); + const struct time_unit *u = bch2_pick_time_units(ns); prt_printf(out, "%llu ", div64_u64(ns, u->nsecs)); prt_tab_rjust(out); @@ -506,10 +365,8 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) { - const struct time_unit *u; s64 f_mean = 0, d_mean = 0; - u64 q, last_q = 0, f_stddev = 0, d_stddev = 0; - int i; + u64 f_stddev = 0, d_stddev = 0; if (stats->buffer) { int cpu; @@ -608,39 +465,24 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats printbuf_tabstops_reset(out); - i = eytzinger0_first(NR_QUANTILES); - u = pick_time_units(stats->quantiles.entries[i].m); - - prt_printf(out, "quantiles (%s):\t", u->name); - eytzinger0_for_each(i, NR_QUANTILES) { - bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; - - q = max(stats->quantiles.entries[i].m, last_q); - prt_printf(out, "%llu ", - div_u64(q, u->nsecs)); - if (is_last) - prt_newline(out); - last_q = q; + if (stats->quantiles_enabled) { + int i = eytzinger0_first(NR_QUANTILES); + const struct time_unit *u = + bch2_pick_time_units(stats->quantiles.entries[i].m); + u64 last_q = 0; + + prt_printf(out, "quantiles (%s):\t", u->name); + eytzinger0_for_each(i, NR_QUANTILES) { + bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; + + u64 q = max(stats->quantiles.entries[i].m, last_q); + prt_printf(out, "%llu ", div_u64(q, u->nsecs)); + if (is_last) + prt_newline(out); + last_q = q; + } } } -#else -void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {} -#endif - -void bch2_time_stats_exit(struct bch2_time_stats *stats) -{ - free_percpu(stats->buffer); -} - -void bch2_time_stats_init(struct bch2_time_stats *stats) -{ - memset(stats, 0, sizeof(*stats)); - stats->duration_stats_weighted.weight = 8; - stats->freq_stats_weighted.weight = 8; - stats->min_duration = U64_MAX; - stats->min_freq = U64_MAX; - spin_lock_init(&stats->lock); -} /* ratelimit: */ diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 4aba415ab5b8..7ffbddb80400 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -21,6 +21,7 @@ #include "mean_and_variance.h" #include "darray.h" +#include "time_stats.h" struct closure; @@ -329,84 +330,8 @@ static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev) #endif } -#define NR_QUANTILES 15 -#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) -#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) -#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) - -struct bch2_quantiles { - struct bch2_quantile_entry { - u64 m; - u64 step; - } entries[NR_QUANTILES]; -}; - -struct bch2_time_stat_buffer { - unsigned nr; - struct bch2_time_stat_buffer_entry { - u64 start; - u64 end; - } entries[32]; -}; - -struct bch2_time_stats { - spinlock_t lock; - /* all fields are in nanoseconds */ - u64 min_duration; - u64 max_duration; - u64 total_duration; - u64 max_freq; - u64 min_freq; - u64 last_event; - struct bch2_quantiles quantiles; - - struct mean_and_variance duration_stats; - struct mean_and_variance_weighted duration_stats_weighted; - struct mean_and_variance freq_stats; - struct mean_and_variance_weighted freq_stats_weighted; - struct bch2_time_stat_buffer __percpu *buffer; -}; - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); - -static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) -{ - __bch2_time_stats_update(stats, start, local_clock()); -} - -static inline bool track_event_change(struct bch2_time_stats *stats, - u64 *start, bool v) -{ - if (v != !!*start) { - if (!v) { - bch2_time_stats_update(stats, *start); - *start = 0; - } else { - *start = local_clock() ?: 1; - return true; - } - } - - return false; -} -#else -static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {} -static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {} -static inline bool track_event_change(struct bch2_time_stats *stats, - u64 *start, bool v) -{ - bool ret = v && !*start; - *start = v; - return ret; -} -#endif - void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *); -void bch2_time_stats_exit(struct bch2_time_stats *); -void bch2_time_stats_init(struct bch2_time_stats *); - #define ewma_add(ewma, val, weight) \ ({ \ typeof(ewma) _ewma = (ewma); \ -- cgit v1.2.3 From cdbfa228a5537dfd7cbd8532701b0c8af70c97b8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 5 Feb 2024 10:50:15 -0800 Subject: bcachefs: time_stats: add larger units Filesystems can stay mounted for a very long time, so add some larger units. Signed-off-by: Darrick J. Wong Signed-off-by: Kent Overstreet --- fs/bcachefs/time_stats.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c index d2f7ddbf6c31..af97474c445b 100644 --- a/fs/bcachefs/time_stats.c +++ b/fs/bcachefs/time_stats.c @@ -17,6 +17,9 @@ static const struct time_unit time_units[] = { { "s", NSEC_PER_SEC }, { "m", (u64) NSEC_PER_SEC * 60}, { "h", (u64) NSEC_PER_SEC * 3600}, + { "d", (u64) NSEC_PER_SEC * 3600 * 24}, + { "w", (u64) NSEC_PER_SEC * 3600 * 24 * 7}, + { "y", (u64) NSEC_PER_SEC * ((3600 * 24 * 7 * 365) + (3600 * (24 / 4) * 7))}, /* 365.25d */ { "eon", U64_MAX }, }; -- cgit v1.2.3 From 4b4f0876ab74167cc402ccd5ce9154e7dc666829 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 8 Feb 2024 18:33:35 -0500 Subject: bcachefs: mean_and_variance: put struct mean_and_variance_weighted on a diet The only caller of this code (time_stats) always knows the weights and whether or not any information has been collected. Pass this information into the mean and variance code so that it doesn't have to store that information. This reduces the structure size from 24 to 16 bytes, which shrinks each time_stats counter to 192 bytes from 208. Signed-off-by: Darrick J. Wong Signed-off-by: Kent Overstreet --- fs/bcachefs/mean_and_variance.c | 28 ++++++++----- fs/bcachefs/mean_and_variance.h | 14 ++++--- fs/bcachefs/mean_and_variance_test.c | 80 +++++++++++++++++++----------------- fs/bcachefs/time_stats.c | 15 +++---- fs/bcachefs/time_stats.h | 6 ++- fs/bcachefs/util.c | 8 ++-- 6 files changed, 84 insertions(+), 67 deletions(-) diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c index bf0ef668fd38..0ea9f30803a2 100644 --- a/fs/bcachefs/mean_and_variance.c +++ b/fs/bcachefs/mean_and_variance.c @@ -103,14 +103,17 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev); * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update() * @s: mean and variance number of samples and their sums * @x: new value to include in the &mean_and_variance_weighted + * @initted: caller must track whether this is the first use or not + * @weight: ewma weight * * see linked pdf: function derived from equations 140-143 where alpha = 2^w. * values are stored bitshifted for performance and added precision. */ -void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x) +void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, + s64 x, bool initted, u8 weight) { // previous weighted variance. - u8 w = s->weight; + u8 w = weight; u64 var_w0 = s->variance; // new value weighted. s64 x_w = x << w; @@ -119,45 +122,50 @@ void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 // new mean weighted. s64 u_w1 = s->mean + diff; - if (!s->init) { + if (!initted) { s->mean = x_w; s->variance = 0; } else { s->mean = u_w1; s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w; } - s->init = true; } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update); /** * mean_and_variance_weighted_get_mean() - get mean from @s * @s: mean and variance number of samples and their sums + * @weight: ewma weight */ -s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s) +s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s, + u8 weight) { - return fast_divpow2(s.mean, s.weight); + return fast_divpow2(s.mean, weight); } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean); /** * mean_and_variance_weighted_get_variance() -- get variance from @s * @s: mean and variance number of samples and their sums + * @weight: ewma weight */ -u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s) +u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s, + u8 weight) { // always positive don't need fast divpow2 - return s.variance >> s.weight; + return s.variance >> weight; } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance); /** * mean_and_variance_weighted_get_stddev() - get standard deviation from @s * @s: mean and variance number of samples and their sums + * @weight: ewma weight */ -u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s) +u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s, + u8 weight) { - return int_sqrt64(mean_and_variance_weighted_get_variance(s)); + return int_sqrt64(mean_and_variance_weighted_get_variance(s, weight)); } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev); diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h index 64df11ab422b..4fcf062dd22c 100644 --- a/fs/bcachefs/mean_and_variance.h +++ b/fs/bcachefs/mean_and_variance.h @@ -154,8 +154,6 @@ struct mean_and_variance { /* expontentially weighted variant */ struct mean_and_variance_weighted { - bool init; - u8 weight; /* base 2 logarithim */ s64 mean; u64 variance; }; @@ -192,10 +190,14 @@ s64 mean_and_variance_get_mean(struct mean_and_variance s); u64 mean_and_variance_get_variance(struct mean_and_variance s1); u32 mean_and_variance_get_stddev(struct mean_and_variance s); -void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v); +void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, + s64 v, bool initted, u8 weight); -s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s); -u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s); -u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s); +s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s, + u8 weight); +u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s, + u8 weight); +u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s, + u8 weight); #endif // MEAN_AND_VAIRANCE_H_ diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c index 019583c3ca0e..db63b3f3b338 100644 --- a/fs/bcachefs/mean_and_variance_test.c +++ b/fs/bcachefs/mean_and_variance_test.c @@ -31,53 +31,59 @@ static void mean_and_variance_basic_test(struct kunit *test) static void mean_and_variance_weighted_test(struct kunit *test) { - struct mean_and_variance_weighted s = { .weight = 2 }; + struct mean_and_variance_weighted s = { }; - mean_and_variance_weighted_update(&s, 10); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 10); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0); + mean_and_variance_weighted_update(&s, 10, false, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 10); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0); - mean_and_variance_weighted_update(&s, 20); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 12); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18); + mean_and_variance_weighted_update(&s, 20, true, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 12); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18); - mean_and_variance_weighted_update(&s, 30); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 16); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72); + mean_and_variance_weighted_update(&s, 30, true, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 16); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72); - s = (struct mean_and_variance_weighted) { .weight = 2 }; + s = (struct mean_and_variance_weighted) { }; - mean_and_variance_weighted_update(&s, -10); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -10); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0); + mean_and_variance_weighted_update(&s, -10, false, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -10); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0); - mean_and_variance_weighted_update(&s, -20); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -12); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18); + mean_and_variance_weighted_update(&s, -20, true, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -12); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18); - mean_and_variance_weighted_update(&s, -30); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -16); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72); + mean_and_variance_weighted_update(&s, -30, true, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -16); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72); } static void mean_and_variance_weighted_advanced_test(struct kunit *test) { - struct mean_and_variance_weighted s = { .weight = 8 }; + struct mean_and_variance_weighted s = { }; + bool initted = false; s64 i; - for (i = 10; i <= 100; i += 10) - mean_and_variance_weighted_update(&s, i); + for (i = 10; i <= 100; i += 10) { + mean_and_variance_weighted_update(&s, i, initted, 8); + initted = true; + } - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 11); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), 11); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107); - s = (struct mean_and_variance_weighted) { .weight = 8 }; + s = (struct mean_and_variance_weighted) { }; + initted = false; - for (i = -10; i >= -100; i -= 10) - mean_and_variance_weighted_update(&s, i); + for (i = -10; i >= -100; i -= 10) { + mean_and_variance_weighted_update(&s, i, initted, 8); + initted = true; + } - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -11); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), -11); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107); } static void do_mean_and_variance_test(struct kunit *test, @@ -92,26 +98,26 @@ static void do_mean_and_variance_test(struct kunit *test, s64 *weighted_stddev) { struct mean_and_variance mv = {}; - struct mean_and_variance_weighted vw = { .weight = weight }; + struct mean_and_variance_weighted vw = { }; for (unsigned i = 0; i < initial_n; i++) { mean_and_variance_update(&mv, initial_value); - mean_and_variance_weighted_update(&vw, initial_value); + mean_and_variance_weighted_update(&vw, initial_value, false, weight); KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), initial_value); KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), 0); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), initial_value); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),0); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), initial_value); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),0); } for (unsigned i = 0; i < n; i++) { mean_and_variance_update(&mv, data[i]); - mean_and_variance_weighted_update(&vw, data[i]); + mean_and_variance_weighted_update(&vw, data[i], true, weight); KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), mean[i]); KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), stddev[i]); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), weighted_mean[i]); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),weighted_stddev[i]); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), weighted_mean[i]); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),weighted_stddev[i]); } KUNIT_EXPECT_EQ(test, mv.n, initial_n + n); diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c index af97474c445b..4ac6ebfd264c 100644 --- a/fs/bcachefs/time_stats.c +++ b/fs/bcachefs/time_stats.c @@ -70,11 +70,13 @@ static inline void time_stats_update_one(struct bch2_time_stats *stats, u64 start, u64 end) { u64 duration, freq; + bool initted = stats->last_event != 0; if (time_after64(end, start)) { duration = end - start; mean_and_variance_update(&stats->duration_stats, duration); - mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration); + mean_and_variance_weighted_update(&stats->duration_stats_weighted, + duration, initted, TIME_STATS_MV_WEIGHT); stats->max_duration = max(stats->max_duration, duration); stats->min_duration = min(stats->min_duration, duration); stats->total_duration += duration; @@ -86,7 +88,8 @@ static inline void time_stats_update_one(struct bch2_time_stats *stats, if (stats->last_event && time_after64(end, stats->last_event)) { freq = end - stats->last_event; mean_and_variance_update(&stats->freq_stats, freq); - mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq); + mean_and_variance_weighted_update(&stats->freq_stats_weighted, + freq, initted, TIME_STATS_MV_WEIGHT); stats->max_freq = max(stats->max_freq, freq); stats->min_freq = min(stats->min_freq, freq); } @@ -118,15 +121,11 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) { unsigned long flags; - WARN_ONCE(!stats->duration_stats_weighted.weight || - !stats->freq_stats_weighted.weight, - "uninitialized bch2_time_stats"); - if (!stats->buffer) { spin_lock_irqsave(&stats->lock, flags); time_stats_update_one(stats, start, end); - if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 && + if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 && stats->duration_stats.n > 1024) stats->buffer = alloc_percpu_gfp(struct time_stat_buffer, @@ -158,8 +157,6 @@ void bch2_time_stats_exit(struct bch2_time_stats *stats) void bch2_time_stats_init(struct bch2_time_stats *stats) { memset(stats, 0, sizeof(*stats)); - stats->duration_stats_weighted.weight = 8; - stats->freq_stats_weighted.weight = 8; stats->min_duration = U64_MAX; stats->min_freq = U64_MAX; spin_lock_init(&stats->lock); diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h index d0291ea863ba..fd6e442443f9 100644 --- a/fs/bcachefs/time_stats.h +++ b/fs/bcachefs/time_stats.h @@ -80,8 +80,12 @@ struct bch2_time_stats { struct quantiles quantiles; struct mean_and_variance duration_stats; - struct mean_and_variance_weighted duration_stats_weighted; struct mean_and_variance freq_stats; + +/* default weight for weighted mean and variance calculations */ +#define TIME_STATS_MV_WEIGHT 8 + + struct mean_and_variance_weighted duration_stats_weighted; struct mean_and_variance_weighted freq_stats_weighted; struct time_stat_buffer __percpu *buffer; }; diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 96de039fc890..0f11e0c4e46d 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -428,14 +428,14 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats prt_tab(out); bch2_pr_time_units_aligned(out, d_mean); prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); prt_newline(out); prt_printf(out, "stddev:"); prt_tab(out); bch2_pr_time_units_aligned(out, d_stddev); prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); printbuf_indent_sub(out, 2); prt_newline(out); @@ -451,14 +451,14 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats prt_tab(out); bch2_pr_time_units_aligned(out, f_mean); prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); prt_newline(out); prt_printf(out, "stddev:"); prt_tab(out); bch2_pr_time_units_aligned(out, f_stddev); prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); printbuf_indent_sub(out, 2); prt_newline(out); -- cgit v1.2.3 From 273960b8f374b95ebd234a99607b7887f515c791 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 1 Feb 2024 12:41:42 -0800 Subject: bcachefs: time_stats: split stats-with-quantiles into a separate structure Currently, struct time_stats has the optional ability to quantize the information that it collects. This is /probably/ useful for callers who want to see quantized information, but it more than doubles the size of the structure from 224 bytes to 464. For users who don't care about that (e.g. upcoming xfs patches) and want to avoid wasting 240 bytes per counter, split the two into separate pieces. Signed-off-by: Darrick J. Wong Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/io_write.c | 2 +- fs/bcachefs/super.c | 8 ++++---- fs/bcachefs/sysfs.c | 4 ++-- fs/bcachefs/time_stats.c | 6 ++++-- fs/bcachefs/time_stats.h | 27 +++++++++++++++++++++++++-- fs/bcachefs/util.c | 7 ++++--- 7 files changed, 41 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 17a5a7151901..339dc3e1dcd3 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -598,7 +598,7 @@ struct bch_dev { /* The rest of this all shows up in sysfs */ atomic64_t cur_latency[2]; - struct bch2_time_stats io_latency[2]; + struct bch2_time_stats_quantiles io_latency[2]; #define CONGESTED_MAX 1024 atomic_t congested; diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 150c272368bf..f137252bccc5 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -88,7 +88,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) bch2_congested_acct(ca, io_latency, now, rw); - __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); + __bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now); } #endif diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 961b25860c3b..233f864ed8b0 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1189,8 +1189,8 @@ static void bch2_dev_free(struct bch_dev *ca) bch2_dev_buckets_free(ca); free_page((unsigned long) ca->sb_read_scratch); - bch2_time_stats_exit(&ca->io_latency[WRITE]); - bch2_time_stats_exit(&ca->io_latency[READ]); + bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]); + bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); percpu_ref_exit(&ca->io_ref); percpu_ref_exit(&ca->ref); @@ -1281,8 +1281,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, INIT_WORK(&ca->io_error_work, bch2_io_error_work); - bch2_time_stats_init(&ca->io_latency[READ]); - bch2_time_stats_init(&ca->io_latency[WRITE]); + bch2_time_stats_quantiles_init(&ca->io_latency[READ]); + bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]); ca->mi = bch2_mi_to_cpu(member); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index cee80c47feea..c86a93a8d8fc 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -930,10 +930,10 @@ SHOW(bch2_dev) sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); if (attr == &sysfs_io_latency_stats_read) - bch2_time_stats_to_text(out, &ca->io_latency[READ]); + bch2_time_stats_to_text(out, &ca->io_latency[READ].stats); if (attr == &sysfs_io_latency_stats_write) - bch2_time_stats_to_text(out, &ca->io_latency[WRITE]); + bch2_time_stats_to_text(out, &ca->io_latency[WRITE].stats); sysfs_printf(congested, "%u%%", clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c index 4ac6ebfd264c..4508e9dcbee2 100644 --- a/fs/bcachefs/time_stats.c +++ b/fs/bcachefs/time_stats.c @@ -73,6 +73,8 @@ static inline void time_stats_update_one(struct bch2_time_stats *stats, bool initted = stats->last_event != 0; if (time_after64(end, start)) { + struct quantiles *quantiles = time_stats_to_quantiles(stats); + duration = end - start; mean_and_variance_update(&stats->duration_stats, duration); mean_and_variance_weighted_update(&stats->duration_stats_weighted, @@ -81,8 +83,8 @@ static inline void time_stats_update_one(struct bch2_time_stats *stats, stats->min_duration = min(stats->min_duration, duration); stats->total_duration += duration; - if (stats->quantiles_enabled) - quantiles_update(&stats->quantiles, duration); + if (quantiles) + quantiles_update(quantiles, duration); } if (stats->last_event && time_after64(end, stats->last_event)) { diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h index fd6e442443f9..ed6c03c436c0 100644 --- a/fs/bcachefs/time_stats.h +++ b/fs/bcachefs/time_stats.h @@ -26,6 +26,7 @@ #include #include +#include #include "mean_and_variance.h" @@ -68,7 +69,7 @@ struct time_stat_buffer { struct bch2_time_stats { spinlock_t lock; - bool quantiles_enabled; + bool have_quantiles; /* all fields are in nanoseconds */ u64 min_duration; u64 max_duration; @@ -77,7 +78,6 @@ struct bch2_time_stats { u64 min_freq; u64 last_event; u64 last_event_start; - struct quantiles quantiles; struct mean_and_variance duration_stats; struct mean_and_variance freq_stats; @@ -90,6 +90,18 @@ struct bch2_time_stats { struct time_stat_buffer __percpu *buffer; }; +struct bch2_time_stats_quantiles { + struct bch2_time_stats stats; + struct quantiles quantiles; +}; + +static inline struct quantiles *time_stats_to_quantiles(struct bch2_time_stats *stats) +{ + return stats->have_quantiles + ? &container_of(stats, struct bch2_time_stats_quantiles, stats)->quantiles + : NULL; +} + void __bch2_time_stats_clear_buffer(struct bch2_time_stats *, struct time_stat_buffer *); void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); @@ -133,4 +145,15 @@ static inline bool track_event_change(struct bch2_time_stats *stats, bool v) void bch2_time_stats_exit(struct bch2_time_stats *); void bch2_time_stats_init(struct bch2_time_stats *); +static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq) +{ + bch2_time_stats_exit(&statq->stats); +} +static inline void bch2_time_stats_quantiles_init(struct bch2_time_stats_quantiles *statq) +{ + bch2_time_stats_init(&statq->stats); + statq->stats.have_quantiles = true; + memset(&statq->quantiles, 0, sizeof(statq->quantiles)); +} + #endif /* _BCACHEFS_TIME_STATS_H */ diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 0f11e0c4e46d..216fadf16928 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -365,6 +365,7 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) { + struct quantiles *quantiles = time_stats_to_quantiles(stats); s64 f_mean = 0, d_mean = 0; u64 f_stddev = 0, d_stddev = 0; @@ -465,17 +466,17 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats printbuf_tabstops_reset(out); - if (stats->quantiles_enabled) { + if (quantiles) { int i = eytzinger0_first(NR_QUANTILES); const struct time_unit *u = - bch2_pick_time_units(stats->quantiles.entries[i].m); + bch2_pick_time_units(quantiles->entries[i].m); u64 last_q = 0; prt_printf(out, "quantiles (%s):\t", u->name); eytzinger0_for_each(i, NR_QUANTILES) { bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; - u64 q = max(stats->quantiles.entries[i].m, last_q); + u64 q = max(quantiles->entries[i].m, last_q); prt_printf(out, "%llu ", div_u64(q, u->nsecs)); if (is_last) prt_newline(out); -- cgit v1.2.3 From be28368b2ccb328b207c9f66c35bb088d91e6a03 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 5 Feb 2024 13:48:21 -0800 Subject: bcachefs: time_stats: shrink time_stat_buffer for better alignment Shrink this percpu object by one array element so that the object size becomes exactly 512 bytes. This will lead to more efficient memory use, hopefully. Signed-off-by: Darrick J. Wong Signed-off-by: Kent Overstreet --- fs/bcachefs/time_stats.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h index ed6c03c436c0..5df61403744b 100644 --- a/fs/bcachefs/time_stats.h +++ b/fs/bcachefs/time_stats.h @@ -64,7 +64,7 @@ struct time_stat_buffer { struct time_stat_buffer_entry { u64 start; u64 end; - } entries[32]; + } entries[31]; }; struct bch2_time_stats { -- cgit v1.2.3