From f469c8bd90b7d595414e4b1876983dc94d0df47e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 12 Apr 2023 11:33:10 +0100 Subject: btrfs: unexport btrfs_prev_leaf() btrfs_prev_leaf() is not used outside ctree.c, so there's no need to export it at ctree.h - just make it static at ctree.c and move its definition above btrfs_search_slot_for_read(), since that function calls it. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 161 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 81 insertions(+), 80 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 2ff2961b1183..28b0402a008f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2378,6 +2378,87 @@ done: return ret; } +/* + * Search the tree again to find a leaf with smaller keys. + * Returns 0 if it found something. + * Returns 1 if there are no smaller keys. + * Returns < 0 on error. + * + * This may release the path, and so you may lose any locks held at the + * time you call it. + */ +static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + struct btrfs_key key; + struct btrfs_key orig_key; + struct btrfs_disk_key found_key; + int ret; + + btrfs_item_key_to_cpu(path->nodes[0], &key, 0); + orig_key = key; + + if (key.offset > 0) { + key.offset--; + } else if (key.type > 0) { + key.type--; + key.offset = (u64)-1; + } else if (key.objectid > 0) { + key.objectid--; + key.type = (u8)-1; + key.offset = (u64)-1; + } else { + return 1; + } + + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret <= 0) + return ret; + + /* + * Previous key not found. Even if we were at slot 0 of the leaf we had + * before releasing the path and calling btrfs_search_slot(), we now may + * be in a slot pointing to the same original key - this can happen if + * after we released the path, one of more items were moved from a + * sibling leaf into the front of the leaf we had due to an insertion + * (see push_leaf_right()). + * If we hit this case and our slot is > 0 and just decrement the slot + * so that the caller does not process the same key again, which may or + * may not break the caller, depending on its logic. + */ + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { + btrfs_item_key(path->nodes[0], &found_key, path->slots[0]); + ret = comp_keys(&found_key, &orig_key); + if (ret == 0) { + if (path->slots[0] > 0) { + path->slots[0]--; + return 0; + } + /* + * At slot 0, same key as before, it means orig_key is + * the lowest, leftmost, key in the tree. We're done. + */ + return 1; + } + } + + btrfs_item_key(path->nodes[0], &found_key, 0); + ret = comp_keys(&found_key, &key); + /* + * We might have had an item with the previous key in the tree right + * before we released our path. And after we released our path, that + * item might have been pushed to the first slot (0) of the leaf we + * were holding due to a tree balance. Alternatively, an item with the + * previous key can exist as the only element of a leaf (big fat item). + * Therefore account for these 2 cases, so that our callers (like + * btrfs_previous_item) don't miss an existing item with a key matching + * the previous key we computed above. + */ + if (ret <= 0) + return 0; + return 1; +} + /* * helper to use instead of search slot if no exact match is needed but * instead the next or previous item should be returned. @@ -4473,86 +4554,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, return ret; } -/* - * search the tree again to find a leaf with lesser keys - * returns 0 if it found something or 1 if there are no lesser leaves. - * returns < 0 on io errors. - * - * This may release the path, and so you may lose any locks held at the - * time you call it. - */ -int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) -{ - struct btrfs_key key; - struct btrfs_key orig_key; - struct btrfs_disk_key found_key; - int ret; - - btrfs_item_key_to_cpu(path->nodes[0], &key, 0); - orig_key = key; - - if (key.offset > 0) { - key.offset--; - } else if (key.type > 0) { - key.type--; - key.offset = (u64)-1; - } else if (key.objectid > 0) { - key.objectid--; - key.type = (u8)-1; - key.offset = (u64)-1; - } else { - return 1; - } - - btrfs_release_path(path); - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret <= 0) - return ret; - - /* - * Previous key not found. Even if we were at slot 0 of the leaf we had - * before releasing the path and calling btrfs_search_slot(), we now may - * be in a slot pointing to the same original key - this can happen if - * after we released the path, one of more items were moved from a - * sibling leaf into the front of the leaf we had due to an insertion - * (see push_leaf_right()). - * If we hit this case and our slot is > 0 and just decrement the slot - * so that the caller does not process the same key again, which may or - * may not break the caller, depending on its logic. - */ - if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { - btrfs_item_key(path->nodes[0], &found_key, path->slots[0]); - ret = comp_keys(&found_key, &orig_key); - if (ret == 0) { - if (path->slots[0] > 0) { - path->slots[0]--; - return 0; - } - /* - * At slot 0, same key as before, it means orig_key is - * the lowest, leftmost, key in the tree. We're done. - */ - return 1; - } - } - - btrfs_item_key(path->nodes[0], &found_key, 0); - ret = comp_keys(&found_key, &key); - /* - * We might have had an item with the previous key in the tree right - * before we released our path. And after we released our path, that - * item might have been pushed to the first slot (0) of the leaf we - * were holding due to a tree balance. Alternatively, an item with the - * previous key can exist as the only element of a leaf (big fat item). - * Therefore account for these 2 cases, so that our callers (like - * btrfs_previous_item) don't miss an existing item with a key matching - * the previous key we computed above. - */ - if (ret <= 0) - return 0; - return 1; -} - /* * A helper function to walk down the tree starting at min_key, and looking * for nodes or leaves that are have a minimum transaction id. -- cgit v1.2.3 From 88ad95b055764e4c74fb6b8201f794f4e531753d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 26 Apr 2023 11:51:37 +0100 Subject: btrfs: tag as unlikely the key comparison when checking sibling keys When checking siblings keys, before moving keys from one node/leaf to a sibling node/leaf, it's very unexpected to have the last key of the left sibling greater than or equals to the first key of the right sibling, as that means we have a (serious) corruption that breaks the key ordering properties of a b+tree. Since this is unexpected, surround the comparison with the unlikely macro, which helps the compiler generate better code for the most expected case (no existing b+tree corruption). This is also what we do for other unexpected cases of invalid key ordering (like at btrfs_set_item_key_safe()). Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 28b0402a008f..0321f4286681 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2707,7 +2707,7 @@ static bool check_sibling_keys(struct extent_buffer *left, btrfs_item_key_to_cpu(right, &right_first, 0); } - if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) { + if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) { btrfs_crit(left->fs_info, "left extent buffer:"); btrfs_print_tree(left, false); btrfs_crit(left->fs_info, "right extent buffer:"); -- cgit v1.2.3 From 6c75a589cb35b8ea5cf9a22f389981acf687ab85 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 27 Apr 2023 20:16:27 +0800 Subject: btrfs: print-tree: pass const extent buffer pointer Since print-tree infrastructure only prints the content of a tree block, we can make them to accept const extent buffer pointer. This removes a forced type convert in extent-tree, where we convert a const extent buffer pointer to regular one, just to avoid compiler warning. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 4 ++-- fs/btrfs/ctree.h | 2 +- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/print-tree.c | 16 ++++++++-------- fs/btrfs/print-tree.h | 4 ++-- 5 files changed, 14 insertions(+), 14 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0321f4286681..f76da470d60b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3077,7 +3077,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, * and nr indicate which items in the leaf to check. This totals up the * space used both by the item structs and the item data */ -static int leaf_space_used(struct extent_buffer *l, int start, int nr) +static int leaf_space_used(const struct extent_buffer *l, int start, int nr) { int data_len; int nritems = btrfs_header_nritems(l); @@ -3097,7 +3097,7 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr) * the start of the leaf data. IOW, how much room * the leaf has left for both items and data */ -noinline int btrfs_leaf_free_space(struct extent_buffer *leaf) +int btrfs_leaf_free_space(const struct extent_buffer *leaf) { struct btrfs_fs_info *fs_info = leaf->fs_info; int nritems = btrfs_header_nritems(leaf); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e81f10e12867..b7ab7fa2b73a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -685,7 +685,7 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) { return btrfs_next_old_item(root, p, 0); } -int btrfs_leaf_free_space(struct extent_buffer *leaf); +int btrfs_leaf_free_space(const struct extent_buffer *leaf); static inline int is_fstree(u64 rootid) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5cd289de4e92..e1d5198d1f5e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -402,7 +402,7 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, } } - btrfs_print_leaf((struct extent_buffer *)eb); + btrfs_print_leaf(eb); btrfs_err(eb->fs_info, "eb %llu iref 0x%lx invalid extent inline ref type %d", eb->start, (unsigned long)iref, type); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 497b9dbd8a13..aa06d9ca911d 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -49,7 +49,7 @@ const char *btrfs_root_name(const struct btrfs_key *key, char *buf) return buf; } -static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) +static void print_chunk(const struct extent_buffer *eb, struct btrfs_chunk *chunk) { int num_stripes = btrfs_chunk_num_stripes(eb, chunk); int i; @@ -62,7 +62,7 @@ static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) btrfs_stripe_offset_nr(eb, chunk, i)); } } -static void print_dev_item(struct extent_buffer *eb, +static void print_dev_item(const struct extent_buffer *eb, struct btrfs_dev_item *dev_item) { pr_info("\t\tdev item devid %llu total_bytes %llu bytes used %llu\n", @@ -70,7 +70,7 @@ static void print_dev_item(struct extent_buffer *eb, btrfs_device_total_bytes(eb, dev_item), btrfs_device_bytes_used(eb, dev_item)); } -static void print_extent_data_ref(struct extent_buffer *eb, +static void print_extent_data_ref(const struct extent_buffer *eb, struct btrfs_extent_data_ref *ref) { pr_cont("extent data backref root %llu objectid %llu offset %llu count %u\n", @@ -80,7 +80,7 @@ static void print_extent_data_ref(struct extent_buffer *eb, btrfs_extent_data_ref_count(eb, ref)); } -static void print_extent_item(struct extent_buffer *eb, int slot, int type) +static void print_extent_item(const struct extent_buffer *eb, int slot, int type) { struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; @@ -169,7 +169,7 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type) WARN_ON(ptr > end); } -static void print_uuid_item(struct extent_buffer *l, unsigned long offset, +static void print_uuid_item(const struct extent_buffer *l, unsigned long offset, u32 item_size) { if (!IS_ALIGNED(item_size, sizeof(u64))) { @@ -191,7 +191,7 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset, * Helper to output refs and locking status of extent buffer. Useful to debug * race condition related problems. */ -static void print_eb_refs_lock(struct extent_buffer *eb) +static void print_eb_refs_lock(const struct extent_buffer *eb) { #ifdef CONFIG_BTRFS_DEBUG btrfs_info(eb->fs_info, "refs %u lock_owner %u current %u", @@ -199,7 +199,7 @@ static void print_eb_refs_lock(struct extent_buffer *eb) #endif } -void btrfs_print_leaf(struct extent_buffer *l) +void btrfs_print_leaf(const struct extent_buffer *l) { struct btrfs_fs_info *fs_info; int i; @@ -355,7 +355,7 @@ void btrfs_print_leaf(struct extent_buffer *l) } } -void btrfs_print_tree(struct extent_buffer *c, bool follow) +void btrfs_print_tree(const struct extent_buffer *c, bool follow) { struct btrfs_fs_info *fs_info; int i; u32 nr; diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index 8c3e9319ec4e..c42bc666d5ee 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -9,8 +9,8 @@ /* Buffer size to contain tree name and possibly additional data (offset) */ #define BTRFS_ROOT_NAME_BUF_LEN 48 -void btrfs_print_leaf(struct extent_buffer *l); -void btrfs_print_tree(struct extent_buffer *c, bool follow); +void btrfs_print_leaf(const struct extent_buffer *l); +void btrfs_print_tree(const struct extent_buffer *c, bool follow); const char *btrfs_root_name(const struct btrfs_key *key, char *buf); #endif -- cgit v1.2.3 From eee3b811784e52f06ccb3e1c4518a9907627d4fe Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 27 Apr 2023 20:16:28 +0800 Subject: btrfs: improve leaf dump and error handling Improve the leaf dump behavior by: - Always dump the leaf first, then the error message - Output the slot number if possible Especially in __btrfs_free_extent() the leaf dump of extent tree can be pretty large. With an extra slot number it's much easier to locate the problem. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 4 +- fs/btrfs/extent-tree.c | 123 +++++++++++++++++++++++-------------------------- 2 files changed, 59 insertions(+), 68 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f76da470d60b..4b33bc087fc7 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2633,6 +2633,7 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, if (slot > 0) { btrfs_item_key(eb, &disk_key, slot - 1); if (unlikely(comp_keys(&disk_key, new_key) >= 0)) { + btrfs_print_leaf(eb); btrfs_crit(fs_info, "slot %u key (%llu %u %llu) new key (%llu %u %llu)", slot, btrfs_disk_key_objectid(&disk_key), @@ -2640,13 +2641,13 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, btrfs_disk_key_offset(&disk_key), new_key->objectid, new_key->type, new_key->offset); - btrfs_print_leaf(eb); BUG(); } } if (slot < btrfs_header_nritems(eb) - 1) { btrfs_item_key(eb, &disk_key, slot + 1); if (unlikely(comp_keys(&disk_key, new_key) <= 0)) { + btrfs_print_leaf(eb); btrfs_crit(fs_info, "slot %u key (%llu %u %llu) new key (%llu %u %llu)", slot, btrfs_disk_key_objectid(&disk_key), @@ -2654,7 +2655,6 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, btrfs_disk_key_offset(&disk_key), new_key->objectid, new_key->type, new_key->offset); - btrfs_print_leaf(eb); BUG(); } } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e1d5198d1f5e..1c326d530764 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1164,15 +1164,10 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, * should not happen at all. */ if (owner < BTRFS_FIRST_FREE_OBJECTID) { + btrfs_print_leaf(path->nodes[0]); btrfs_crit(trans->fs_info, -"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu", - bytenr, num_bytes, root_objectid); - if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) { - WARN_ON(1); - btrfs_crit(trans->fs_info, - "path->slots[0]=%d path->nodes[0]:", path->slots[0]); - btrfs_print_leaf(path->nodes[0]); - } +"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u", + bytenr, num_bytes, root_objectid, path->slots[0]); return -EUCLEAN; } update_inline_extent_backref(path, iref, refs_to_add, extent_op); @@ -2838,6 +2833,13 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, return ret; } +#define abort_and_dump(trans, path, fmt, args...) \ +({ \ + btrfs_abort_transaction(trans, -EUCLEAN); \ + btrfs_print_leaf(path->nodes[0]); \ + btrfs_crit(trans->fs_info, fmt, ##args); \ +}) + /* * Drop one or more refs of @node. * @@ -2978,10 +2980,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (!found_extent) { if (iref) { - btrfs_crit(info, -"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref"); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, +"invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref", + path->slots[0]); + ret = -EUCLEAN; + goto out; } /* Must be SHARED_* item, remove the backref first */ ret = remove_extent_backref(trans, extent_root, path, @@ -3029,11 +3032,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } if (ret) { - btrfs_err(info, - "umm, got %d back from search, was looking for %llu", - ret, bytenr); if (ret > 0) btrfs_print_leaf(path->nodes[0]); + btrfs_err(info, + "umm, got %d back from search, was looking for %llu, slot %d", + ret, bytenr, path->slots[0]); } if (ret < 0) { btrfs_abort_transaction(trans, ret); @@ -3042,12 +3045,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, extent_slot = path->slots[0]; } } else if (WARN_ON(ret == -ENOENT)) { - btrfs_print_leaf(path->nodes[0]); - btrfs_err(info, - "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", - bytenr, parent, root_objectid, owner_objectid, - owner_offset); - btrfs_abort_transaction(trans, ret); + abort_and_dump(trans, path, +"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d", + bytenr, parent, root_objectid, owner_objectid, + owner_offset, path->slots[0]); goto out; } else { btrfs_abort_transaction(trans, ret); @@ -3067,14 +3068,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && key.type == BTRFS_EXTENT_ITEM_KEY) { struct btrfs_tree_block_info *bi; + if (item_size < sizeof(*ei) + sizeof(*bi)) { - btrfs_crit(info, -"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu", - key.objectid, key.type, key.offset, - owner_objectid, item_size, - sizeof(*ei) + sizeof(*bi)); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, +"invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu", + key.objectid, key.type, key.offset, + path->slots[0], owner_objectid, item_size, + sizeof(*ei) + sizeof(*bi)); + ret = -EUCLEAN; + goto out; } bi = (struct btrfs_tree_block_info *)(ei + 1); WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); @@ -3082,11 +3084,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, refs = btrfs_extent_refs(leaf, ei); if (refs < refs_to_drop) { - btrfs_crit(info, - "trying to drop %d refs but we only have %llu for bytenr %llu", - refs_to_drop, refs, bytenr); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, + "trying to drop %d refs but we only have %llu for bytenr %llu slot %u", + refs_to_drop, refs, bytenr, path->slots[0]); + ret = -EUCLEAN; + goto out; } refs -= refs_to_drop; @@ -3099,10 +3101,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, */ if (iref) { if (!found_extent) { - btrfs_crit(info, -"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found"); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, +"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u", + path->slots[0]); + ret = -EUCLEAN; + goto out; } } else { btrfs_set_extent_refs(leaf, ei, refs); @@ -3121,21 +3124,21 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (found_extent) { if (is_data && refs_to_drop != extent_data_ref_count(path, iref)) { - btrfs_crit(info, - "invalid refs_to_drop, current refs %u refs_to_drop %u", - extent_data_ref_count(path, iref), - refs_to_drop); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, + "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u", + extent_data_ref_count(path, iref), + refs_to_drop, path->slots[0]); + ret = -EUCLEAN; + goto out; } if (iref) { if (path->slots[0] != extent_slot) { - btrfs_crit(info, -"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref", - key.objectid, key.type, - key.offset); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, +"invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref", + key.objectid, key.type, + key.offset, path->slots[0]); + ret = -EUCLEAN; + goto out; } } else { /* @@ -3145,10 +3148,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ] */ if (path->slots[0] != extent_slot + 1) { - btrfs_crit(info, - "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM"); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, + "invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM", + path->slots[0]); + ret = -EUCLEAN; + goto out; } path->slots[0] = extent_slot; num_to_del = 2; @@ -3170,19 +3174,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); return ret; -err_dump: - /* - * Leaf dump can take up a lot of log buffer, so we only do full leaf - * dump for debug build. - */ - if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) { - btrfs_crit(info, "path->slots[0]=%d extent_slot=%d", - path->slots[0], extent_slot); - btrfs_print_leaf(path->nodes[0]); - } - - btrfs_free_path(path); - return -EUCLEAN; } /* -- cgit v1.2.3 From 4aec05fa5a190d641664c2bca8ad270cf8190b8c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sat, 29 Apr 2023 16:07:11 -0400 Subject: btrfs: remove level argument from btrfs_set_block_flags We just pass in btrfs_header_level(eb) for the level, and we're passing in the eb already, so simply get the level from the eb inside of btrfs_set_block_flags. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 5 +---- fs/btrfs/extent-tree.c | 7 +++---- fs/btrfs/extent-tree.h | 2 +- 3 files changed, 5 insertions(+), 9 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4b33bc087fc7..0c83cd78a483 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -464,10 +464,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, return ret; } if (new_flags != 0) { - int level = btrfs_header_level(buf); - - ret = btrfs_set_disk_extent_flags(trans, buf, - new_flags, level); + ret = btrfs_set_disk_extent_flags(trans, buf, new_flags); if (ret) return ret; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 581ddb24e113..99b9ab5ccf93 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2152,10 +2152,10 @@ out: } int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct extent_buffer *eb, u64 flags, - int level) + struct extent_buffer *eb, u64 flags) { struct btrfs_delayed_extent_op *extent_op; + int level = btrfs_header_level(eb); int ret; extent_op = btrfs_alloc_delayed_extent_op(); @@ -5095,8 +5095,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, BUG_ON(ret); /* -ENOMEM */ ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_set_disk_extent_flags(trans, eb, flag, - btrfs_header_level(eb)); + ret = btrfs_set_disk_extent_flags(trans, eb, flag); BUG_ON(ret); /* -ENOMEM */ wc->flags[level] |= flag; } diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 0c958fc1b3b8..429d5c570061 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -141,7 +141,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct extent_buffer *eb, u64 flags, int level); + struct extent_buffer *eb, u64 flags); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, -- cgit v1.2.3 From b3cbfb0dd4a80c359701280f6acfa37131d8ee8b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sat, 29 Apr 2023 16:07:20 -0400 Subject: btrfs: add a btrfs_csum_type_size helper This is needed in btrfs-progs for the tools that convert the checksum types for file systems and a few other things. We don't have it in the kernel as we just want to get the size for the super blocks type. However I don't want to have to manually add this every time we sync ctree.c into btrfs-progs, so add the helper in the kernel with a note so it doesn't get removed by a later cleanup. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 8 +++++++- fs/btrfs/ctree.h | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0c83cd78a483..ddd22f16cdff 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -150,13 +150,19 @@ static inline void copy_leaf_items(const struct extent_buffer *dst, nr_items * sizeof(struct btrfs_item)); } +/* This exists for btrfs-progs usages. */ +u16 btrfs_csum_type_size(u16 type) +{ + return btrfs_csums[type].size; +} + int btrfs_super_csum_size(const struct btrfs_super_block *s) { u16 t = btrfs_super_csum_type(s); /* * csum type is validated at mount time */ - return btrfs_csums[t].size; + return btrfs_csum_type_size(t); } const char *btrfs_super_csum_name(u16 csum_type) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b7ab7fa2b73a..717fca1797d6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -701,6 +701,7 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; } +u16 btrfs_csum_type_size(u16 type); int btrfs_super_csum_size(const struct btrfs_super_block *s); const char *btrfs_super_csum_name(u16 csum_type); const char *btrfs_super_csum_driver(u16 csum_type); -- cgit v1.2.3 From 016f9d0b744202eb170ed91e995fc757dd4adeeb Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sat, 29 Apr 2023 16:07:21 -0400 Subject: btrfs: rename del_ptr to btrfs_del_ptr and export it This exists internal to ctree.c, however btrfs check needs to use it for some of its operations. I'd rather not duplicate that code inside of btrfs check as this is low level and I want to keep this code in one place, so rename the function to btrfs_del_ptr and export it so that it can be used inside of btrfs-progs safely. Add a comment to make sure this doesn't get removed by a future cleanup. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 16 ++++++++-------- fs/btrfs/ctree.h | 2 ++ 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ddd22f16cdff..2f2071d64c52 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -37,8 +37,6 @@ static int push_node_left(struct btrfs_trans_handle *trans, static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); -static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, - int level, int slot); static const struct btrfs_csums { u16 size; @@ -1122,7 +1120,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(right) == 0) { btrfs_clear_buffer_dirty(trans, right); btrfs_tree_unlock(right); - del_ptr(root, path, level + 1, pslot + 1); + btrfs_del_ptr(root, path, level + 1, pslot + 1); root_sub_used(root, right->len); btrfs_free_tree_block(trans, btrfs_root_id(root), right, 0, 1); @@ -1168,7 +1166,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(mid) == 0) { btrfs_clear_buffer_dirty(trans, mid); btrfs_tree_unlock(mid); - del_ptr(root, path, level + 1, pslot); + btrfs_del_ptr(root, path, level + 1, pslot); root_sub_used(root, mid->len); btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); free_extent_buffer_stale(mid); @@ -4357,9 +4355,11 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, * * the tree should have been previously balanced so the deletion does not * empty a node. + * + * This is exported for use inside btrfs-progs, don't un-export it. */ -static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, - int level, int slot) +void btrfs_del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, + int slot) { struct extent_buffer *parent = path->nodes[level]; u32 nritems; @@ -4414,7 +4414,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, struct extent_buffer *leaf) { WARN_ON(btrfs_header_generation(leaf) != trans->transid); - del_ptr(root, path, 1, path->slots[1]); + btrfs_del_ptr(root, path, 1, path->slots[1]); /* * btrfs_free_extent is expensive, we want to make sure we @@ -4500,7 +4500,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, /* push_leaf_left fixes the path. * make sure the path still points to our leaf - * for possible call to del_ptr below + * for possible call to btrfs_del_ptr below */ slot = path->slots[1]; atomic_inc(&leaf->refs); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 717fca1797d6..5af61480dde6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -541,6 +541,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct extent_buffer **cow_ret, u64 new_root_objectid); int btrfs_block_can_be_shared(struct btrfs_root *root, struct extent_buffer *buf); +void btrfs_del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, + int slot); void btrfs_extend_item(struct btrfs_path *path, u32 data_size); void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, -- cgit v1.2.3 From 5cead5422a0e3d13b0bcee986c0f5c4ebb94100b Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 1 Jun 2023 11:55:14 -0700 Subject: btrfs: insert tree mod log move in push_node_left There is a fairly unlikely race condition in tree mod log rewind that can result in a kernel panic which has the following trace: [530.569] BTRFS critical (device sda3): unable to find logical 0 length 4096 [530.585] BTRFS critical (device sda3): unable to find logical 0 length 4096 [530.602] BUG: kernel NULL pointer dereference, address: 0000000000000002 [530.618] #PF: supervisor read access in kernel mode [530.629] #PF: error_code(0x0000) - not-present page [530.641] PGD 0 P4D 0 [530.647] Oops: 0000 [#1] SMP [530.654] CPU: 30 PID: 398973 Comm: below Kdump: loaded Tainted: G S O K 5.12.0-0_fbk13_clang_7455_gb24de3bdb045 #1 [530.680] Hardware name: Quanta Mono Lake-M.2 SATA 1HY9U9Z001G/Mono Lake-M.2 SATA, BIOS F20_3A15 08/16/2017 [530.703] RIP: 0010:__btrfs_map_block+0xaa/0xd00 [530.755] RSP: 0018:ffffc9002c2f7600 EFLAGS: 00010246 [530.767] RAX: ffffffffffffffea RBX: ffff888292e41000 RCX: f2702d8b8be15100 [530.784] RDX: ffff88885fda6fb8 RSI: ffff88885fd973c8 RDI: ffff88885fd973c8 [530.800] RBP: ffff888292e410d0 R08: ffffffff82fd7fd0 R09: 00000000fffeffff [530.816] R10: ffffffff82e57fd0 R11: ffffffff82e57d70 R12: 0000000000000000 [530.832] R13: 0000000000001000 R14: 0000000000001000 R15: ffffc9002c2f76f0 [530.848] FS: 00007f38d64af000(0000) GS:ffff88885fd80000(0000) knlGS:0000000000000000 [530.866] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [530.880] CR2: 0000000000000002 CR3: 00000002b6770004 CR4: 00000000003706e0 [530.896] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [530.912] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [530.928] Call Trace: [530.934] ? btrfs_printk+0x13b/0x18c [530.943] ? btrfs_bio_counter_inc_blocked+0x3d/0x130 [530.955] btrfs_map_bio+0x75/0x330 [530.963] ? kmem_cache_alloc+0x12a/0x2d0 [530.973] ? btrfs_submit_metadata_bio+0x63/0x100 [530.984] btrfs_submit_metadata_bio+0xa4/0x100 [530.995] submit_extent_page+0x30f/0x360 [531.004] read_extent_buffer_pages+0x49e/0x6d0 [531.015] ? submit_extent_page+0x360/0x360 [531.025] btree_read_extent_buffer_pages+0x5f/0x150 [531.037] read_tree_block+0x37/0x60 [531.046] read_block_for_search+0x18b/0x410 [531.056] btrfs_search_old_slot+0x198/0x2f0 [531.066] resolve_indirect_ref+0xfe/0x6f0 [531.076] ? ulist_alloc+0x31/0x60 [531.084] ? kmem_cache_alloc_trace+0x12e/0x2b0 [531.095] find_parent_nodes+0x720/0x1830 [531.105] ? ulist_alloc+0x10/0x60 [531.113] iterate_extent_inodes+0xea/0x370 [531.123] ? btrfs_previous_extent_item+0x8f/0x110 [531.134] ? btrfs_search_path_in_tree+0x240/0x240 [531.146] iterate_inodes_from_logical+0x98/0xd0 [531.157] ? btrfs_search_path_in_tree+0x240/0x240 [531.168] btrfs_ioctl_logical_to_ino+0xd9/0x180 [531.179] btrfs_ioctl+0xe2/0x2eb0 This occurs when logical inode resolution takes a tree mod log sequence number, and then while backref walking hits a rewind on a busy node which has the following sequence of tree mod log operations (numbers filled in from a specific example, but they are somewhat arbitrary) REMOVE_WHILE_FREEING slot 532 REMOVE_WHILE_FREEING slot 531 REMOVE_WHILE_FREEING slot 530 ... REMOVE_WHILE_FREEING slot 0 REMOVE slot 455 REMOVE slot 454 REMOVE slot 453 ... REMOVE slot 0 ADD slot 455 ADD slot 454 ADD slot 453 ... ADD slot 0 MOVE src slot 0 -> dst slot 456 nritems 533 REMOVE slot 455 REMOVE slot 454 REMOVE slot 453 ... REMOVE slot 0 When this sequence gets applied via btrfs_tree_mod_log_rewind, it allocates a fresh rewind eb, and first inserts the correct key info for the 533 elements, then overwrites the first 456 of them, then decrements the count by 456 via the add ops, then rewinds the move by doing a memmove from 456:988->0:532. We have never written anything past 532, so that memmove writes garbage into the 0:532 range. In practice, this results in a lot of fully 0 keys. The rewind then puts valid keys into slots 0:455 with the last removes, but 456:532 are still invalid. When search_old_slot uses this eb, if it uses one of those invalid slots, it can then read the extent buffer and issue a bio for offset 0 which ultimately panics looking up extent mappings. This bad tree mod log sequence gets generated when the node balancing code happens to do a balance_node_right followed by a push_node_left while logging in the tree mod log. Illustrated for ebs L and R (left and right): L R start: [XXX|YYY|...] [ZZZ|...|...] balance_node_right: [XXX|YYY|...] [...|ZZZ|...] move Z to make room for Y [XXX|...|...] [YYY|ZZZ|...] copy Y from L to R push_node_left: [XXX|YYY|...] [...|ZZZ|...] copy Y from R to L [XXX|YYY|...] [ZZZ|...|...] move Z into emptied space (NOT LOGGED!) This is because balance_node_right logs a move, but push_node_left explicitly doesn't. That is because logging the move would remove the overwritten src < dst range in the right eb, which was already logged when we called btrfs_tree_mod_log_eb_copy. The correct sequence would include a move from 456:988 to 0:532 after remove 0:455 and before removing 0:532. Reversing that sequence would entail creating keys for 0:532, then moving those keys out to 456:988, then creating more keys for 0:455. i.e., REMOVE_WHILE_FREEING slot 532 REMOVE_WHILE_FREEING slot 531 REMOVE_WHILE_FREEING slot 530 ... REMOVE_WHILE_FREEING slot 0 MOVE src slot 456 -> dst slot 0 nritems 533 REMOVE slot 455 REMOVE slot 454 REMOVE slot 453 ... REMOVE slot 0 ADD slot 455 ADD slot 454 ADD slot 453 ... ADD slot 0 MOVE src slot 0 -> dst slot 456 nritems 533 REMOVE slot 455 REMOVE slot 454 REMOVE slot 453 ... REMOVE slot 0 Fix this to log the move but avoid the double remove by putting all the logging logic in btrfs_tree_mod_log_eb_copy which has enough information to detect these cases and properly log moves, removes, and adds. Leave btrfs_tree_mod_log_insert_move to handle insert_ptr and delete_ptr's tree mod logging. (Un)fortunately, this is quite difficult to reproduce, and I was only able to reproduce it by adding sleeps in btrfs_search_old_slot that would encourage more log rewinding during ino_to_logical ioctls. I was able to hit the warning in the previous patch in the series without the fix quite quickly, but not after this patch. CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Filipe Manana Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 11 +++++--- fs/btrfs/tree-mod-log.c | 73 +++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 71 insertions(+), 13 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 2f2071d64c52..385524224037 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2785,8 +2785,8 @@ static int push_node_left(struct btrfs_trans_handle *trans, if (push_items < src_nritems) { /* - * Don't call btrfs_tree_mod_log_insert_move() here, key removal - * was already fully logged by btrfs_tree_mod_log_eb_copy() above. + * btrfs_tree_mod_log_eb_copy handles logging the move, so we + * don't need to do an explicit tree mod log operation for it. */ memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0), btrfs_node_key_ptr_offset(src, push_items), @@ -2847,8 +2847,11 @@ static int balance_node_right(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); return ret; } - ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); - BUG_ON(ret < 0); + + /* + * btrfs_tree_mod_log_eb_copy handles logging the move, so we don't + * need to do an explicit tree mod log operation for it. + */ memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items), btrfs_node_key_ptr_offset(dst, 0), (dst_nritems) * diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 39545d1d2e9a..07c086f9e35e 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -248,6 +248,26 @@ int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, return ret; } +static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb, + int dst_slot, int src_slot, + int nr_items) +{ + struct tree_mod_elem *tm; + + tm = kzalloc(sizeof(*tm), GFP_NOFS); + if (!tm) + return ERR_PTR(-ENOMEM); + + tm->logical = eb->start; + tm->slot = src_slot; + tm->move.dst_slot = dst_slot; + tm->move.nr_items = nr_items; + tm->op = BTRFS_MOD_LOG_MOVE_KEYS; + RB_CLEAR_NODE(&tm->node); + + return tm; +} + int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items) @@ -265,18 +285,13 @@ int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, if (!tm_list) return -ENOMEM; - tm = kzalloc(sizeof(*tm), GFP_NOFS); - if (!tm) { - ret = -ENOMEM; + tm = tree_mod_log_alloc_move(eb, dst_slot, src_slot, nr_items); + if (IS_ERR(tm)) { + ret = PTR_ERR(tm); + tm = NULL; goto free_tms; } - tm->logical = eb->start; - tm->slot = src_slot; - tm->move.dst_slot = dst_slot; - tm->move.nr_items = nr_items; - tm->op = BTRFS_MOD_LOG_MOVE_KEYS; - for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING); @@ -489,6 +504,10 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, struct tree_mod_elem **tm_list_add, **tm_list_rem; int i; bool locked = false; + struct tree_mod_elem *dst_move_tm = NULL; + struct tree_mod_elem *src_move_tm = NULL; + u32 dst_move_nr_items = btrfs_header_nritems(dst) - dst_offset; + u32 src_move_nr_items = btrfs_header_nritems(src) - (src_offset + nr_items); if (!tree_mod_need_log(fs_info, NULL)) return 0; @@ -501,6 +520,26 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, if (!tm_list) return -ENOMEM; + if (dst_move_nr_items) { + dst_move_tm = tree_mod_log_alloc_move(dst, dst_offset + nr_items, + dst_offset, dst_move_nr_items); + if (IS_ERR(dst_move_tm)) { + ret = PTR_ERR(dst_move_tm); + dst_move_tm = NULL; + goto free_tms; + } + } + if (src_move_nr_items) { + src_move_tm = tree_mod_log_alloc_move(src, src_offset, + src_offset + nr_items, + src_move_nr_items); + if (IS_ERR(src_move_tm)) { + ret = PTR_ERR(src_move_tm); + src_move_tm = NULL; + goto free_tms; + } + } + tm_list_add = tm_list; tm_list_rem = tm_list + nr_items; for (i = 0; i < nr_items; i++) { @@ -523,6 +562,11 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, goto free_tms; locked = true; + if (dst_move_tm) { + ret = tree_mod_log_insert(fs_info, dst_move_tm); + if (ret) + goto free_tms; + } for (i = 0; i < nr_items; i++) { ret = tree_mod_log_insert(fs_info, tm_list_rem[i]); if (ret) @@ -531,6 +575,11 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, if (ret) goto free_tms; } + if (src_move_tm) { + ret = tree_mod_log_insert(fs_info, src_move_tm); + if (ret) + goto free_tms; + } write_unlock(&fs_info->tree_mod_log_lock); kfree(tm_list); @@ -538,6 +587,12 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, return 0; free_tms: + if (dst_move_tm && !RB_EMPTY_NODE(&dst_move_tm->node)) + rb_erase(&dst_move_tm->node, &fs_info->tree_mod_log); + kfree(dst_move_tm); + if (src_move_tm && !RB_EMPTY_NODE(&src_move_tm->node)) + rb_erase(&src_move_tm->node, &fs_info->tree_mod_log); + kfree(src_move_tm); for (i = 0; i < nr_items * 2; i++) { if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); -- cgit v1.2.3 From d09c51521f22f9cbdfb1cf63e5c456077c622c84 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Jun 2023 11:27:37 +0100 Subject: btrfs: add missing error handling when logging operation while COWing extent buffer When COWing an extent buffer that is not the root node, we need to log in the tree mod log that we replaced a pointer in the parent node, otherwise a tree mod log user doing a search on the b+tree can return incorrect results (that miss something). We are doing the call to btrfs_tree_mod_log_insert_key() but we totally ignore its return value. So fix this by adding the missing error handling, resulting in a transaction abort and freeing the COWed extent buffer. Fixes: f230475e62f7 ("Btrfs: put all block modifications into the tree mod log") CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 385524224037..7f7f13965fe9 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -595,8 +595,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, add_root_to_dirty_list(root); } else { WARN_ON(trans->transid != btrfs_header_generation(parent)); - btrfs_tree_mod_log_insert_key(parent, parent_slot, - BTRFS_MOD_LOG_KEY_REPLACE); + ret = btrfs_tree_mod_log_insert_key(parent, parent_slot, + BTRFS_MOD_LOG_KEY_REPLACE); + if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); + btrfs_abort_transaction(trans, ret); + return ret; + } btrfs_set_node_blockptr(parent, parent_slot, cow->start); btrfs_set_node_ptr_generation(parent, parent_slot, -- cgit v1.2.3 From ede600e497b1461d06d22a7d17703d9096868bc3 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Jun 2023 11:27:38 +0100 Subject: btrfs: fix extent buffer leak after tree mod log failure at split_node() At split_node(), if we fail to log the tree mod log copy operation, we return without unlocking the split extent buffer we just allocated and without decrementing the reference we own on it. Fix this by unlocking it and decrementing the ref count before returning. Fixes: 5de865eebb83 ("Btrfs: fix tree mod logging") CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 7f7f13965fe9..8496535828de 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3053,6 +3053,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); if (ret) { + btrfs_tree_unlock(split); + free_extent_buffer(split); btrfs_abort_transaction(trans, ret); return ret; } -- cgit v1.2.3 From 40b0a749388517de244643c09bdbb98f7dcb6ef1 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Jun 2023 11:27:40 +0100 Subject: btrfs: do not BUG_ON() on tree mod log failure at __btrfs_cow_block() At __btrfs_cow_block(), instead of doing a BUG_ON() in case we fail to record a tree mod log root insertion operation, do a transaction abort instead. There's really no need for the BUG_ON(), we can properly release all resources in this context and turn the filesystem to RO mode and in an error state instead. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 8496535828de..d6c29564ce49 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -584,9 +584,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) parent_start = buf->start; - atomic_inc(&cow->refs); ret = btrfs_tree_mod_log_insert_root(root->node, cow, true); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); + btrfs_abort_transaction(trans, ret); + return ret; + } + atomic_inc(&cow->refs); rcu_assign_pointer(root->node, cow); btrfs_free_tree_block(trans, btrfs_root_id(root), buf, -- cgit v1.2.3 From 39020d8abc7ec62c4de9b260e3d10d4a1c2478ce Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Jun 2023 11:27:41 +0100 Subject: btrfs: do not BUG_ON() on tree mod log failure at balance_level() At balance_level(), instead of doing a BUG_ON() in case we fail to record tree mod log operations, do a transaction abort and return the error to the callers. There's really no need for the BUG_ON() as we can release all resources in this context, and we have to abort because other future tree searches that use the tree mod log (btrfs_search_old_slot()) may get inconsistent results if other operations modify the tree after that failure and before the tree mod log based search. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d6c29564ce49..d60b28c6bd1b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1054,7 +1054,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } ret = btrfs_tree_mod_log_insert_root(root->node, child, true); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_tree_unlock(child); + free_extent_buffer(child); + btrfs_abort_transaction(trans, ret); + goto enospc; + } rcu_assign_pointer(root->node, child); add_root_to_dirty_list(root); @@ -1142,7 +1147,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_node_key(right, &right_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, BTRFS_MOD_LOG_KEY_REPLACE); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto enospc; + } btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); } @@ -1188,7 +1196,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_node_key(mid, &mid_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot, BTRFS_MOD_LOG_KEY_REPLACE); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto enospc; + } btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); } -- cgit v1.2.3 From daefe4d435d75118af302dae650547b8b760da0b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Jun 2023 11:27:42 +0100 Subject: btrfs: rename enospc label to out at balance_level() At balance_level() we have this 'enospc' label where we jump to in case we get an error at several places. However that error is certainly not -ENOSPC in call cases, it can be -EIO or -ENOMEM when reading a child extent buffer for example, or -ENOMEM when trying to record tree mod log operations. So to make this less confusing, rename the label to 'out'. Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d60b28c6bd1b..e98f9e205e25 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1041,7 +1041,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (IS_ERR(child)) { ret = PTR_ERR(child); btrfs_handle_fs_error(fs_info, ret, NULL); - goto enospc; + goto out; } btrfs_tree_lock(child); @@ -1050,7 +1050,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (ret) { btrfs_tree_unlock(child); free_extent_buffer(child); - goto enospc; + goto out; } ret = btrfs_tree_mod_log_insert_root(root->node, child, true); @@ -1058,7 +1058,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_unlock(child); free_extent_buffer(child); btrfs_abort_transaction(trans, ret); - goto enospc; + goto out; } rcu_assign_pointer(root->node, child); @@ -1087,7 +1087,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (IS_ERR(left)) { ret = PTR_ERR(left); left = NULL; - goto enospc; + goto out; } __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); @@ -1096,7 +1096,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NESTING_LEFT_COW); if (wret) { ret = wret; - goto enospc; + goto out; } } @@ -1105,7 +1105,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (IS_ERR(right)) { ret = PTR_ERR(right); right = NULL; - goto enospc; + goto out; } __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); @@ -1114,7 +1114,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NESTING_RIGHT_COW); if (wret) { ret = wret; - goto enospc; + goto out; } } @@ -1149,7 +1149,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_MOD_LOG_KEY_REPLACE); if (ret < 0) { btrfs_abort_transaction(trans, ret); - goto enospc; + goto out; } btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -1168,12 +1168,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (!left) { ret = -EROFS; btrfs_handle_fs_error(fs_info, ret, NULL); - goto enospc; + goto out; } wret = balance_node_right(trans, mid, left); if (wret < 0) { ret = wret; - goto enospc; + goto out; } if (wret == 1) { wret = push_node_left(trans, left, mid, 1); @@ -1198,7 +1198,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_MOD_LOG_KEY_REPLACE); if (ret < 0) { btrfs_abort_transaction(trans, ret); - goto enospc; + goto out; } btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); @@ -1225,7 +1225,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (orig_ptr != btrfs_node_blockptr(path->nodes[level], path->slots[level])) BUG(); -enospc: +out: if (right) { btrfs_tree_unlock(right); free_extent_buffer(right); -- cgit v1.2.3 From 87b8e9d06e3315c6f8d478fc8fc0dc4d25cf0e09 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Jun 2023 11:27:43 +0100 Subject: btrfs: avoid unnecessarily setting the fs to RO and error state at balance_level() At balance_level(), when trying to promote a child node to a root node, if we fail to read the child we call btrfs_handle_fs_error(), which turns the fs to RO mode and sets it to error state as well, causing any ongoing transaction to abort. This however is not necessary because at that point we have not made any change yet at balance_level(), so any error reading the child node does not leaves us in any inconsistent state. Therefore we can just return the error to the caller. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e98f9e205e25..4dcdcf25c3fe 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1040,7 +1040,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, child = btrfs_read_node_slot(mid, 0); if (IS_ERR(child)) { ret = PTR_ERR(child); - btrfs_handle_fs_error(fs_info, ret, NULL); goto out; } -- cgit v1.2.3 From 725026ed593f1b6da66fe7e3777cd891dbf7343f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Jun 2023 11:27:44 +0100 Subject: btrfs: abort transaction at balance_level() when left child is missing At balance_level() we are calling btrfs_handle_fs_error() when the middle child only has 1 item and the left child is missing, however we can simply use btrfs_abort_transaction(), which achieves the same purposes: to turn the fs to error state, abort the current transaction and turn the fs to RO mode. Besides that, btrfs_abort_transaction() also prints a stack trace which makes it more useful. Also, as this is a highly unexpected case and it's about a b+tree inconsistency, change the error code from -EROFS to -EUCLEAN, tag the if branch as 'unlikely' and log an explicit error message. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4dcdcf25c3fe..00eea2925d1d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1164,9 +1164,13 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, * otherwise we would have pulled some pointers from the * right */ - if (!left) { - ret = -EROFS; - btrfs_handle_fs_error(fs_info, ret, NULL); + if (unlikely(!left)) { + btrfs_crit(fs_info, +"missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu", + parent->start, btrfs_header_level(parent), + mid->start, btrfs_root_id(root)); + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); goto out; } wret = balance_node_right(trans, mid, left); -- cgit v1.2.3 From eced687e224eb3cc5a501cf53ad9291337c8dbc5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Jun 2023 11:27:45 +0100 Subject: btrfs: abort transaction at update_ref_for_cow() when ref count is zero At update_ref_for_cow() we are calling btrfs_handle_fs_error() if we find that the extent buffer has an unexpected ref count of zero, however we can simply use btrfs_abort_transaction(), which achieves the same purposes: to turn the fs to error state, abort the current transaction and turn the fs to RO mode as well. Besides that, btrfs_abort_transaction() also prints a stack trace which makes it more useful. Also, as this is a very unexpected situation, indicating a serious corruption/inconsistency, tag the if branch as 'unlikely', set the error code to -EUCLEAN instead of -EROFS, and log an explicit message. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 00eea2925d1d..b643bd8656bf 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -421,9 +421,13 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, &refs, &flags); if (ret) return ret; - if (refs == 0) { - ret = -EROFS; - btrfs_handle_fs_error(fs_info, ret, NULL); + if (unlikely(refs == 0)) { + btrfs_crit(fs_info, + "found 0 references for tree block at bytenr %llu level %d root %llu", + buf->start, btrfs_header_level(buf), + btrfs_root_id(root)); + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); return ret; } } else { -- cgit v1.2.3 From 11d6ae03557e34dd2bc9e57d1a5139ab3c7be54f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Jun 2023 11:27:46 +0100 Subject: btrfs: do not BUG_ON() on tree mod log failures at push_nodes_for_insert() At push_nodes_for_insert(), instead of doing a BUG_ON() in case we fail to record tree mod log operations, do a transaction abort and return the error to the caller. There's really no need for the BUG_ON() as we can release all resources in this context, and we have to abort because other future tree searches that use the tree mod log (btrfs_search_old_slot()) may get inconsistent results if other operations modify the tree after that failure and before the tree mod log based search. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index b643bd8656bf..dc60ece85664 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1308,7 +1308,12 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_node_key(mid, &disk_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot, BTRFS_MOD_LOG_KEY_REPLACE); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_tree_unlock(left); + free_extent_buffer(left); + btrfs_abort_transaction(trans, ret); + return ret; + } btrfs_set_node_key(parent, &disk_key, pslot); btrfs_mark_buffer_dirty(parent); if (btrfs_header_nritems(left) > orig_slot) { @@ -1363,7 +1368,12 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_node_key(right, &disk_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, BTRFS_MOD_LOG_KEY_REPLACE); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + btrfs_abort_transaction(trans, ret); + return ret; + } btrfs_set_node_key(parent, &disk_key, pslot + 1); btrfs_mark_buffer_dirty(parent); -- cgit v1.2.3 From f61aa7ba08abc09eaf8eb766d9469e9393c22dbf Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Jun 2023 11:27:47 +0100 Subject: btrfs: do not BUG_ON() on tree mod log failure at insert_new_root() At insert_new_root(), instead of doing a BUG_ON() in case we fail to record the tree mod log operation, just return the error to the callers after releasing the allocated tree block. At this point we haven't made any changes to the b+tree, so we haven't left it in an inconsistent state and therefore have no need to abort the transaction. All we need to do is to unlock and free the extent buffer we just allocated with the purpose of making it the new root. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index dc60ece85664..58325b8f544f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2964,7 +2964,12 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, old = root->node; ret = btrfs_tree_mod_log_insert_root(root->node, c, false); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1); + btrfs_tree_unlock(c); + free_extent_buffer(c); + return ret; + } rcu_assign_pointer(root->node, c); /* the super has an extra ref to root->node */ -- cgit v1.2.3 From 50b5d1fc41da21a4ecf219f37fbca23c79020b08 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Jun 2023 11:27:48 +0100 Subject: btrfs: do not BUG_ON() on tree mod log failures at insert_ptr() At insert_ptr(), instead of doing a BUG_ON() in case we fail to record tree mod log operations, do a transaction abort and return the error to the callers. There's really no need for the BUG_ON() as we can release all resources in the context of all callers, and we have to abort because other future tree searches that use the tree mod log (btrfs_search_old_slot()) may get inconsistent results if other operations modify the tree after that failure and before the tree mod log based search. This implies making insert_ptr() return an int instead of void, and making all callers check for returned errors. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 71 +++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 52 insertions(+), 19 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 58325b8f544f..e075e6994d79 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2990,10 +2990,10 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, * slot and level indicate where you want the key to go, and * blocknr is the block the key points to. */ -static void insert_ptr(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_disk_key *key, u64 bytenr, - int slot, int level) +static int insert_ptr(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_disk_key *key, u64 bytenr, + int slot, int level) { struct extent_buffer *lower; int nritems; @@ -3009,7 +3009,10 @@ static void insert_ptr(struct btrfs_trans_handle *trans, if (level) { ret = btrfs_tree_mod_log_insert_move(lower, slot + 1, slot, nritems - slot); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + return ret; + } } memmove_extent_buffer(lower, btrfs_node_key_ptr_offset(lower, slot + 1), @@ -3019,7 +3022,10 @@ static void insert_ptr(struct btrfs_trans_handle *trans, if (level) { ret = btrfs_tree_mod_log_insert_key(lower, slot, BTRFS_MOD_LOG_KEY_ADD); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + return ret; + } } btrfs_set_node_key(lower, key, slot); btrfs_set_node_blockptr(lower, slot, bytenr); @@ -3027,6 +3033,8 @@ static void insert_ptr(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(lower, slot, trans->transid); btrfs_set_header_nritems(lower, nritems + 1); btrfs_mark_buffer_dirty(lower); + + return 0; } /* @@ -3106,8 +3114,13 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); btrfs_mark_buffer_dirty(split); - insert_ptr(trans, path, &disk_key, split->start, - path->slots[level + 1] + 1, level + 1); + ret = insert_ptr(trans, path, &disk_key, split->start, + path->slots[level + 1] + 1, level + 1); + if (ret < 0) { + btrfs_tree_unlock(split); + free_extent_buffer(split); + return ret; + } if (path->slots[level] >= mid) { path->slots[level] -= mid; @@ -3584,16 +3597,17 @@ out: * split the path's leaf in two, making sure there is at least data_size * available for the resulting leaf level of the path. */ -static noinline void copy_for_split(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct extent_buffer *l, - struct extent_buffer *right, - int slot, int mid, int nritems) +static noinline int copy_for_split(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct extent_buffer *l, + struct extent_buffer *right, + int slot, int mid, int nritems) { struct btrfs_fs_info *fs_info = trans->fs_info; int data_copy_size; int rt_data_off; int i; + int ret; struct btrfs_disk_key disk_key; struct btrfs_map_token token; @@ -3618,7 +3632,9 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(l, mid); btrfs_item_key(right, &disk_key, 0); - insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1); + ret = insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1); + if (ret < 0) + return ret; btrfs_mark_buffer_dirty(right); btrfs_mark_buffer_dirty(l); @@ -3636,6 +3652,8 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, } BUG_ON(path->slots[0] < 0); + + return 0; } /* @@ -3834,8 +3852,13 @@ again: if (split == 0) { if (mid <= slot) { btrfs_set_header_nritems(right, 0); - insert_ptr(trans, path, &disk_key, - right->start, path->slots[1] + 1, 1); + ret = insert_ptr(trans, path, &disk_key, + right->start, path->slots[1] + 1, 1); + if (ret < 0) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + return ret; + } btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3843,8 +3866,13 @@ again: path->slots[1] += 1; } else { btrfs_set_header_nritems(right, 0); - insert_ptr(trans, path, &disk_key, - right->start, path->slots[1], 1); + ret = insert_ptr(trans, path, &disk_key, + right->start, path->slots[1], 1); + if (ret < 0) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + return ret; + } btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3860,7 +3888,12 @@ again: return ret; } - copy_for_split(trans, path, l, right, slot, mid, nritems); + ret = copy_for_split(trans, path, l, right, slot, mid, nritems); + if (ret < 0) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + return ret; + } if (split == 2) { BUG_ON(num_doubles != 0); -- cgit v1.2.3 From 751a27615ddaaf95519565d83bac65b8aafab9e8 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Jun 2023 11:27:49 +0100 Subject: btrfs: do not BUG_ON() on tree mod log failures at btrfs_del_ptr() At btrfs_del_ptr(), instead of doing a BUG_ON() in case we fail to record tree mod log operations, do a transaction abort and return the error to the callers. There's really no need for the BUG_ON() as we can release all resources in the context of all callers, and we have to abort because other future tree searches that use the tree mod log (btrfs_search_old_slot()) may get inconsistent results if other operations modify the tree after that failure and before the tree mod log based search. This implies btrfs_del_ptr() return an int instead of void, and making all callers check for returned errors. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 52 +++++++++++++++++++++++++++++++++++++++------------- fs/btrfs/ctree.h | 4 ++-- 2 files changed, 41 insertions(+), 15 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e075e6994d79..190dd90a88eb 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1139,7 +1139,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(right) == 0) { btrfs_clear_buffer_dirty(trans, right); btrfs_tree_unlock(right); - btrfs_del_ptr(root, path, level + 1, pslot + 1); + ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1); + if (ret < 0) { + free_extent_buffer_stale(right); + right = NULL; + goto out; + } root_sub_used(root, right->len); btrfs_free_tree_block(trans, btrfs_root_id(root), right, 0, 1); @@ -1192,7 +1197,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(mid) == 0) { btrfs_clear_buffer_dirty(trans, mid); btrfs_tree_unlock(mid); - btrfs_del_ptr(root, path, level + 1, pslot); + ret = btrfs_del_ptr(trans, root, path, level + 1, pslot); + if (ret < 0) { + free_extent_buffer_stale(mid); + mid = NULL; + goto out; + } root_sub_used(root, mid->len); btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); free_extent_buffer_stale(mid); @@ -4440,8 +4450,8 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, * * This is exported for use inside btrfs-progs, don't un-export it. */ -void btrfs_del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, - int slot) +int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot) { struct extent_buffer *parent = path->nodes[level]; u32 nritems; @@ -4452,7 +4462,10 @@ void btrfs_del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, if (level) { ret = btrfs_tree_mod_log_insert_move(parent, slot, slot + 1, nritems - slot - 1); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + return ret; + } } memmove_extent_buffer(parent, btrfs_node_key_ptr_offset(parent, slot), @@ -4462,7 +4475,10 @@ void btrfs_del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, } else if (level) { ret = btrfs_tree_mod_log_insert_key(parent, slot, BTRFS_MOD_LOG_KEY_REMOVE); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + return ret; + } } nritems--; @@ -4478,6 +4494,7 @@ void btrfs_del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, fixup_low_keys(path, &disk_key, level + 1); } btrfs_mark_buffer_dirty(parent); + return 0; } /* @@ -4490,13 +4507,17 @@ void btrfs_del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, * The path must have already been setup for deleting the leaf, including * all the proper balancing. path->nodes[1] must be locked. */ -static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *leaf) +static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *leaf) { + int ret; + WARN_ON(btrfs_header_generation(leaf) != trans->transid); - btrfs_del_ptr(root, path, 1, path->slots[1]); + ret = btrfs_del_ptr(trans, root, path, 1, path->slots[1]); + if (ret < 0) + return ret; /* * btrfs_free_extent is expensive, we want to make sure we @@ -4509,6 +4530,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, atomic_inc(&leaf->refs); btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1); free_extent_buffer_stale(leaf); + return 0; } /* * delete the item at the leaf level in path. If that empties @@ -4558,7 +4580,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_level(leaf, 0); } else { btrfs_clear_buffer_dirty(trans, leaf); - btrfs_del_leaf(trans, root, path, leaf); + ret = btrfs_del_leaf(trans, root, path, leaf); + if (ret < 0) + return ret; } } else { int used = leaf_space_used(leaf, 0, nritems); @@ -4619,7 +4643,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (btrfs_header_nritems(leaf) == 0) { path->slots[1] = slot; - btrfs_del_leaf(trans, root, path, leaf); + ret = btrfs_del_leaf(trans, root, path, leaf); + if (ret < 0) + return ret; free_extent_buffer(leaf); ret = 0; } else { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5af61480dde6..f2d2b313bde5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -541,8 +541,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct extent_buffer **cow_ret, u64 new_root_objectid); int btrfs_block_can_be_shared(struct btrfs_root *root, struct extent_buffer *buf); -void btrfs_del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, - int slot); +int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot); void btrfs_extend_item(struct btrfs_path *path, u32 data_size); void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, -- cgit v1.2.3 From 7569141e8fa855b509e674456132b83bca5f087c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 9 Jun 2023 11:49:07 +0100 Subject: btrfs: replace BUG_ON() at split_item() with proper error handling There's no need to BUG_ON() at split_item() if the leaf does not have enough free space for the new item, we can just return -ENOSPC since the caller can deal with errors from split_item(). Also, as this is a very unlikely condition to happen, because the caller has invoked setup_leaf_for_split() before calling split_item(), surround the condition with a WARN_ON() which makes it easier to notice this unexpected condition and tags the if branch with 'unlikely' as well. I've actually once hit this BUG_ON() with some incorrect code changes I had, which was very inconvenient as it required rebooting the VM. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 190dd90a88eb..a4cb4b642987 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4000,7 +4000,12 @@ static noinline int split_item(struct btrfs_path *path, struct btrfs_disk_key disk_key; leaf = path->nodes[0]; - BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)); + /* + * Shouldn't happen because the caller must have previously called + * setup_leaf_for_split() to make room for the new item in the leaf. + */ + if (WARN_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item))) + return -ENOSPC; orig_slot = path->slots[0]; orig_offset = btrfs_item_offset(leaf, path->slots[0]); -- cgit v1.2.3