diff options
Diffstat (limited to 'fs/btrfs')
119 files changed, 12921 insertions, 9270 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index fa8515598341..ea95c90c8474 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -3,9 +3,9 @@ config BTRFS_FS tristate "Btrfs filesystem support" select BLK_CGROUP_PUNT_BIO + select CRC32 select CRYPTO select CRYPTO_CRC32C - select LIBCRC32C select CRYPTO_XXHASH select CRYPTO_SHA256 select CRYPTO_BLAKE2B @@ -52,10 +52,10 @@ config BTRFS_FS_RUN_SANITY_TESTS bool "Btrfs will run sanity tests upon loading" depends on BTRFS_FS help - This will run some basic sanity tests on the free space cache - code to make sure it is acting as it should. These are mostly - regression tests and are only really interesting to btrfs - developers. + This will run sanity tests for core functionality like free space, + extent maps, extent io, extent buffers, inodes, qgroups and others, + at module load time. These are mostly regression tests and are only + interesting to developers. If unsure, say N. @@ -63,9 +63,12 @@ config BTRFS_DEBUG bool "Btrfs debugging support" depends on BTRFS_FS help - Enable run-time debugging support for the btrfs filesystem. This may - enable additional and expensive checks with negative impact on - performance, or export extra information via sysfs. + Enable run-time debugging support for the btrfs filesystem. + + Additional potentially expensive checks, debugging functionality or + sysfs exported information is enabled, like leak checks of internal + objects, optional forced space fragmentation and /sys/fs/btrfs/debug . + This has negative impact on performance. If unsure, say N. @@ -73,8 +76,10 @@ config BTRFS_ASSERT bool "Btrfs assert support" depends on BTRFS_FS help - Enable run-time assertion checking. This will result in panics if - any of the assertions trip. This is meant for btrfs developers only. + Enable run-time assertion checking. Additional safety checks are + done, simple enough not to affect performance but verify invariants + and assumptions of code to run properly. This may result in panics, + and is meant for developers but can be enabled in general. If unsure, say N. @@ -89,7 +94,14 @@ config BTRFS_EXPERIMENTAL Current list: - - extent map shrinker - performance problems with too frequent shrinks + - COW fixup worker warning - last warning before removing the + functionality catching out-of-band page + dirtying, not necessary since 5.8 + + - RAID mirror read policy - additional read policies for balancing + reading from redundant block group + profiles (currently: pid, round-robin, + fixed devid) - send stream protocol v3 - fs-verity support @@ -102,6 +114,8 @@ config BTRFS_EXPERIMENTAL - extent tree v2 - complex rework of extent tracking + - large folio support + If unsure, say N. config BTRFS_FS_REF_VERIFY diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 3cfc440c636c..2d5f0482678b 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -44,4 +44,4 @@ btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \ tests/free-space-tree-tests.o tests/extent-map-tests.o \ - tests/raid-stripe-tree-tests.o + tests/raid-stripe-tree-tests.o tests/delayed-refs-tests.o diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c index e3716516ca38..861c7d92c437 100644 --- a/fs/btrfs/accessors.c +++ b/fs/btrfs/accessors.c @@ -9,27 +9,24 @@ #include "fs.h" #include "accessors.h" -static bool check_setget_bounds(const struct extent_buffer *eb, - const void *ptr, unsigned off, int size) +static void __cold report_setget_bounds(const struct extent_buffer *eb, + const void *ptr, unsigned off, int size) { - const unsigned long member_offset = (unsigned long)ptr + off; + unsigned long member_offset = (unsigned long)ptr + off; - if (unlikely(member_offset + size > eb->len)) { - btrfs_warn(eb->fs_info, - "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d", - (member_offset > eb->len ? "start" : "end"), - (unsigned long)ptr, eb->start, member_offset, size); - return false; - } - - return true; + btrfs_warn(eb->fs_info, + "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d", + (member_offset > eb->len ? "start" : "end"), + (unsigned long)ptr, eb->start, member_offset, size); } -void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb) +/* Copy bytes from @src1 and @src2 to @dest. */ +static __always_inline void memcpy_split_src(char *dest, const char *src1, + const char *src2, const size_t len1, + const size_t total) { - token->eb = eb; - token->kaddr = folio_address(eb->folios[0]); - token->offset = 0; + memcpy(dest, src1, len1); + memcpy(dest + len1, src2, total - len1); } /* @@ -41,11 +38,6 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e * - btrfs_set_8 (for 8/16/32/64) * - btrfs_get_8 (for 8/16/32/64) * - * Generic helpers with a token (cached address of the most recently accessed - * page): - * - btrfs_set_token_8 (for 8/16/32/64) - * - btrfs_get_token_8 (for 8/16/32/64) - * * The set/get functions handle data spanning two pages transparently, in case * metadata block size is larger than page. Every pointer to metadata items is * an offset into the extent buffer page array, cast to a specific type. This @@ -57,118 +49,66 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e */ #define DEFINE_BTRFS_SETGET_BITS(bits) \ -u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ - const void *ptr, unsigned long off) \ -{ \ - const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \ - const unsigned long oil = get_eb_offset_in_folio(token->eb, \ - member_offset);\ - const int unit_size = token->eb->folio_size; \ - const int unit_shift = token->eb->folio_shift; \ - const int size = sizeof(u##bits); \ - u8 lebytes[sizeof(u##bits)]; \ - const int part = unit_size - oil; \ - \ - ASSERT(token); \ - ASSERT(token->kaddr); \ - ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \ - if (token->offset <= member_offset && \ - member_offset + size <= token->offset + unit_size) { \ - return get_unaligned_le##bits(token->kaddr + oil); \ - } \ - token->kaddr = folio_address(token->eb->folios[idx]); \ - token->offset = idx << unit_shift; \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \ - return get_unaligned_le##bits(token->kaddr + oil); \ - \ - memcpy(lebytes, token->kaddr + oil, part); \ - token->kaddr = folio_address(token->eb->folios[idx + 1]); \ - token->offset = (idx + 1) << unit_shift; \ - memcpy(lebytes + part, token->kaddr, size - part); \ - return get_unaligned_le##bits(lebytes); \ -} \ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ const void *ptr, unsigned long off) \ { \ const unsigned long member_offset = (unsigned long)ptr + off; \ const unsigned long idx = get_eb_folio_index(eb, member_offset);\ - const unsigned long oil = get_eb_offset_in_folio(eb, \ - member_offset);\ - const int unit_size = eb->folio_size; \ - char *kaddr = folio_address(eb->folios[idx]); \ - const int size = sizeof(u##bits); \ - const int part = unit_size - oil; \ - u8 lebytes[sizeof(u##bits)]; \ - \ - ASSERT(check_setget_bounds(eb, ptr, off, size)); \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \ - return get_unaligned_le##bits(kaddr + oil); \ - \ - memcpy(lebytes, kaddr + oil, part); \ - kaddr = folio_address(eb->folios[idx + 1]); \ - memcpy(lebytes + part, kaddr, size - part); \ - return get_unaligned_le##bits(lebytes); \ -} \ -void btrfs_set_token_##bits(struct btrfs_map_token *token, \ - const void *ptr, unsigned long off, \ - u##bits val) \ -{ \ - const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \ - const unsigned long oil = get_eb_offset_in_folio(token->eb, \ + const unsigned long oif = get_eb_offset_in_folio(eb, \ member_offset);\ - const int unit_size = token->eb->folio_size; \ - const int unit_shift = token->eb->folio_shift; \ - const int size = sizeof(u##bits); \ + char *kaddr = folio_address(eb->folios[idx]) + oif; \ + const int part = eb->folio_size - oif; \ u8 lebytes[sizeof(u##bits)]; \ - const int part = unit_size - oil; \ \ - ASSERT(token); \ - ASSERT(token->kaddr); \ - ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \ - if (token->offset <= member_offset && \ - member_offset + size <= token->offset + unit_size) { \ - put_unaligned_le##bits(val, token->kaddr + oil); \ - return; \ + if (unlikely(member_offset + sizeof(u##bits) > eb->len)) { \ + report_setget_bounds(eb, ptr, off, sizeof(u##bits)); \ + return 0; \ } \ - token->kaddr = folio_address(token->eb->folios[idx]); \ - token->offset = idx << unit_shift; \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || \ - oil + size <= unit_size) { \ - put_unaligned_le##bits(val, token->kaddr + oil); \ - return; \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || sizeof(u##bits) == 1 || \ + likely(sizeof(u##bits) <= part)) \ + return get_unaligned_le##bits(kaddr); \ + \ + if (sizeof(u##bits) == 2) { \ + lebytes[0] = *kaddr; \ + kaddr = folio_address(eb->folios[idx + 1]); \ + lebytes[1] = *kaddr; \ + } else { \ + memcpy_split_src(lebytes, kaddr, \ + folio_address(eb->folios[idx + 1]), \ + part, sizeof(u##bits)); \ } \ - put_unaligned_le##bits(val, lebytes); \ - memcpy(token->kaddr + oil, lebytes, part); \ - token->kaddr = folio_address(token->eb->folios[idx + 1]); \ - token->offset = (idx + 1) << unit_shift; \ - memcpy(token->kaddr, lebytes + part, size - part); \ + return get_unaligned_le##bits(lebytes); \ } \ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ unsigned long off, u##bits val) \ { \ const unsigned long member_offset = (unsigned long)ptr + off; \ const unsigned long idx = get_eb_folio_index(eb, member_offset);\ - const unsigned long oil = get_eb_offset_in_folio(eb, \ + const unsigned long oif = get_eb_offset_in_folio(eb, \ member_offset);\ - const int unit_size = eb->folio_size; \ - char *kaddr = folio_address(eb->folios[idx]); \ - const int size = sizeof(u##bits); \ - const int part = unit_size - oil; \ + char *kaddr = folio_address(eb->folios[idx]) + oif; \ + const int part = eb->folio_size - oif; \ u8 lebytes[sizeof(u##bits)]; \ \ - ASSERT(check_setget_bounds(eb, ptr, off, size)); \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || \ - oil + size <= unit_size) { \ - put_unaligned_le##bits(val, kaddr + oil); \ + if (unlikely(member_offset + sizeof(u##bits) > eb->len)) { \ + report_setget_bounds(eb, ptr, off, sizeof(u##bits)); \ + return; \ + } \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || sizeof(u##bits) == 1 || \ + likely(sizeof(u##bits) <= part)) { \ + put_unaligned_le##bits(val, kaddr); \ return; \ } \ - \ put_unaligned_le##bits(val, lebytes); \ - memcpy(kaddr + oil, lebytes, part); \ - kaddr = folio_address(eb->folios[idx + 1]); \ - memcpy(kaddr, lebytes + part, size - part); \ + if (sizeof(u##bits) == 2) { \ + *kaddr = lebytes[0]; \ + kaddr = folio_address(eb->folios[idx + 1]); \ + *kaddr = lebytes[1]; \ + } else { \ + memcpy(kaddr, lebytes, part); \ + kaddr = folio_address(eb->folios[idx + 1]); \ + memcpy(kaddr, lebytes + part, sizeof(u##bits) - part); \ + } \ } DEFINE_BTRFS_SETGET_BITS(8) diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 7a7e0ef69973..99b3ced12805 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -12,17 +12,10 @@ #include <linux/string.h> #include <linux/mm.h> #include <uapi/linux/btrfs_tree.h> +#include "extent_io.h" struct extent_buffer; -struct btrfs_map_token { - struct extent_buffer *eb; - char *kaddr; - unsigned long offset; -}; - -void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb); - /* * Some macros to generate set/get functions for the struct fields. This * assumes there is a lefoo_to_cpu for every type, so lets make a simple one @@ -55,11 +48,6 @@ static inline void put_unaligned_le8(u8 val, void *p) sizeof_field(type, member))) #define DECLARE_BTRFS_SETGET_BITS(bits) \ -u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ - const void *ptr, unsigned long off); \ -void btrfs_set_token_##bits(struct btrfs_map_token *token, \ - const void *ptr, unsigned long off, \ - u##bits val); \ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ const void *ptr, unsigned long off); \ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ @@ -82,18 +70,6 @@ static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ { \ static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ btrfs_set_##bits(eb, s, offsetof(type, member), val); \ -} \ -static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ - const type *s) \ -{ \ - static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ - return btrfs_get_token_##bits(token, s, offsetof(type, member));\ -} \ -static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ - type *s, u##bits val) \ -{ \ - static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ - btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ } #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ @@ -478,18 +454,6 @@ static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \ int slot, u32 val) \ { \ btrfs_set_raw_item_##member(eb, btrfs_item_nr(eb, slot), val); \ -} \ -static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \ - int slot) \ -{ \ - struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \ - return btrfs_token_raw_item_##member(token, item); \ -} \ -static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \ - int slot, u32 val) \ -{ \ - struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \ - btrfs_set_token_raw_item_##member(token, item, val); \ } BTRFS_ITEM_SETGET_FUNCS(offset) diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h index 48b9ddae4a46..0458cd51ed48 100644 --- a/fs/btrfs/acl.h +++ b/fs/btrfs/acl.h @@ -3,6 +3,8 @@ #ifndef BTRFS_ACL_H #define BTRFS_ACL_H +#include <linux/types.h> + struct posix_acl; struct inode; struct btrfs_trans_handle; diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 361a866c1995..6c6f3bb58f4e 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -18,7 +18,7 @@ enum { }; #define NO_THRESHOLD (-1) -#define DFT_THRESHOLD (32) +#define DEFAULT_THRESHOLD (32) struct btrfs_workqueue { struct workqueue_struct *normal_wq; @@ -94,9 +94,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, ret->limit_active = limit_active; if (thresh == 0) - thresh = DFT_THRESHOLD; + thresh = DEFAULT_THRESHOLD; /* For low threshold, disabling threshold is a better choice */ - if (thresh < DFT_THRESHOLD) { + if (thresh < DEFAULT_THRESHOLD) { ret->current_active = limit_active; ret->thresh = NO_THRESHOLD; } else { @@ -168,7 +168,7 @@ static inline void thresh_exec_hook(struct btrfs_workqueue *wq) { int new_current_active; long pending; - int need_change = 0; + bool need_change = false; if (wq->thresh == NO_THRESHOLD) return; @@ -196,15 +196,14 @@ static inline void thresh_exec_hook(struct btrfs_workqueue *wq) new_current_active--; new_current_active = clamp_val(new_current_active, 1, wq->limit_active); if (new_current_active != wq->current_active) { - need_change = 1; + need_change = true; wq->current_active = new_current_active; } out: spin_unlock(&wq->thres_lock); - if (need_change) { + if (need_change) workqueue_set_max_active(wq->normal_wq, wq->current_active); - } } static void run_ordered_work(struct btrfs_workqueue *wq, @@ -220,8 +219,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq, spin_lock_irqsave(lock, flags); if (list_empty(list)) break; - work = list_entry(list->next, struct btrfs_work, - ordered_list); + work = list_first_entry(list, struct btrfs_work, ordered_list); if (!test_bit(WORK_DONE_BIT, &work->flags)) break; /* @@ -296,7 +294,7 @@ static void btrfs_work_helper(struct work_struct *normal_work) struct btrfs_work *work = container_of(normal_work, struct btrfs_work, normal_work); struct btrfs_workqueue *wq = work->wq; - int need_order = 0; + bool need_order = false; /* * We should not touch things inside work in the following cases: @@ -307,7 +305,7 @@ static void btrfs_work_helper(struct work_struct *normal_work) * So we save the needed things here. */ if (work->ordered_func) - need_order = 1; + need_order = true; trace_btrfs_work_sched(work); thresh_exec_hook(wq); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 04f53ca548e1..6a450be293b1 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -250,6 +250,21 @@ static int prelim_ref_compare(const struct prelim_ref *ref1, return 0; } +static int prelim_ref_rb_add_cmp(const struct rb_node *new, + const struct rb_node *exist) +{ + const struct prelim_ref *ref_new = + rb_entry(new, struct prelim_ref, rbnode); + const struct prelim_ref *ref_exist = + rb_entry(exist, struct prelim_ref, rbnode); + + /* + * prelim_ref_compare() expects the first parameter as the existing one, + * different from the rb_find_add_cached() order. + */ + return prelim_ref_compare(ref_exist, ref_new); +} + static void update_share_count(struct share_check *sc, int oldcount, int newcount, const struct prelim_ref *newref) { @@ -278,55 +293,39 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info, struct share_check *sc) { struct rb_root_cached *root; - struct rb_node **p; - struct rb_node *parent = NULL; - struct prelim_ref *ref; - int result; - bool leftmost = true; + struct rb_node *exist; root = &preftree->root; - p = &root->rb_root.rb_node; + exist = rb_find_add_cached(&newref->rbnode, root, prelim_ref_rb_add_cmp); + if (exist) { + struct prelim_ref *ref = rb_entry(exist, struct prelim_ref, rbnode); + /* Identical refs, merge them and free @newref */ + struct extent_inode_elem *eie = ref->inode_list; - while (*p) { - parent = *p; - ref = rb_entry(parent, struct prelim_ref, rbnode); - result = prelim_ref_compare(ref, newref); - if (result < 0) { - p = &(*p)->rb_left; - } else if (result > 0) { - p = &(*p)->rb_right; - leftmost = false; - } else { - /* Identical refs, merge them and free @newref */ - struct extent_inode_elem *eie = ref->inode_list; + while (eie && eie->next) + eie = eie->next; - while (eie && eie->next) - eie = eie->next; - - if (!eie) - ref->inode_list = newref->inode_list; - else - eie->next = newref->inode_list; - trace_btrfs_prelim_ref_merge(fs_info, ref, newref, - preftree->count); - /* - * A delayed ref can have newref->count < 0. - * The ref->count is updated to follow any - * BTRFS_[ADD|DROP]_DELAYED_REF actions. - */ - update_share_count(sc, ref->count, - ref->count + newref->count, newref); - ref->count += newref->count; - free_pref(newref); - return; - } + if (!eie) + ref->inode_list = newref->inode_list; + else + eie->next = newref->inode_list; + trace_btrfs_prelim_ref_merge(fs_info, ref, newref, + preftree->count); + /* + * A delayed ref can have newref->count < 0. + * The ref->count is updated to follow any + * BTRFS_[ADD|DROP]_DELAYED_REF actions. + */ + update_share_count(sc, ref->count, + ref->count + newref->count, newref); + ref->count += newref->count; + free_pref(newref); + return; } update_share_count(sc, 0, newref->count, newref); preftree->count++; trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count); - rb_link_node(&newref->rbnode, parent, p); - rb_insert_color_cached(&newref->rbnode, root, leftmost); } /* @@ -734,7 +733,6 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx, struct preftrees *preftrees, struct share_check *sc) { - int err; int ret = 0; struct ulist *parents; struct ulist_node *node; @@ -753,6 +751,7 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx, */ while ((rnode = rb_first_cached(&preftrees->indirect.root))) { struct prelim_ref *ref; + int ret2; ref = rb_entry(rnode, struct prelim_ref, rbnode); if (WARN(ref->parent, @@ -774,18 +773,18 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx, ret = BACKREF_FOUND_SHARED; goto out; } - err = resolve_indirect_ref(ctx, path, preftrees, ref, parents); + ret2 = resolve_indirect_ref(ctx, path, preftrees, ref, parents); /* * we can only tolerate ENOENT,otherwise,we should catch error * and return directly. */ - if (err == -ENOENT) { + if (ret2 == -ENOENT) { prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref, NULL); continue; - } else if (err) { + } else if (ret2) { free_pref(ref); - ret = err; + ret = ret2; goto out; } @@ -1400,11 +1399,11 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx, ASSERT(ctx->roots == NULL); key.objectid = ctx->bytenr; - key.offset = (u64)-1; if (btrfs_fs_incompat(ctx->fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; path = btrfs_alloc_path(); if (!path) @@ -2202,16 +2201,15 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, int ret; u64 flags; u64 size = 0; - u32 item_size; const struct extent_buffer *eb; struct btrfs_extent_item *ei; struct btrfs_key key; + key.objectid = logical; if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; - key.objectid = logical; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); @@ -2245,7 +2243,6 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, } eb = path->nodes[0]; - item_size = btrfs_item_size(eb, path->slots[0]); ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); flags = btrfs_extent_flags(eb, ei); @@ -2253,7 +2250,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, btrfs_debug(fs_info, "logical %llu is at position %llu within the extent (%llu EXTENT_ITEM %llu) flags %#llx size %u", logical, logical - found_key->objectid, found_key->objectid, - found_key->offset, flags, item_size); + found_key->offset, flags, btrfs_item_size(eb, path->slots[0])); WARN_ON(!flags_ret); if (flags_ret) { @@ -2549,17 +2546,20 @@ static int build_ino_list(u64 inum, u64 offset, u64 num_bytes, u64 root, void *c } int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, - struct btrfs_path *path, void *ctx, bool ignore_offset) { struct btrfs_backref_walk_ctx walk_ctx = { 0 }; int ret; u64 flags = 0; struct btrfs_key found_key; - int search_commit_root = path->search_commit_root; + struct btrfs_path *path; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; ret = extent_from_logical(fs_info, logical, path, &found_key, &flags); - btrfs_release_path(path); + btrfs_free_path(path); if (ret < 0) return ret; if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) @@ -2572,8 +2572,7 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, walk_ctx.extent_item_pos = logical - found_key.objectid; walk_ctx.fs_info = fs_info; - return iterate_extent_inodes(&walk_ctx, search_commit_root, - build_ino_list, ctx); + return iterate_extent_inodes(&walk_ctx, false, build_ino_list, ctx); } static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, @@ -2878,7 +2877,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) goto release; } if (path->slots[0] == 0) { - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN(); ret = -EUCLEAN; goto release; } @@ -3022,9 +3021,6 @@ void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info, cache->rb_root = RB_ROOT; for (i = 0; i < BTRFS_MAX_LEVEL; i++) INIT_LIST_HEAD(&cache->pending[i]); - INIT_LIST_HEAD(&cache->changed); - INIT_LIST_HEAD(&cache->detached); - INIT_LIST_HEAD(&cache->leaves); INIT_LIST_HEAD(&cache->pending_edge); INIT_LIST_HEAD(&cache->useless_node); cache->fs_info = fs_info; @@ -3132,29 +3128,17 @@ void btrfs_backref_drop_node(struct btrfs_backref_cache *tree, void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, struct btrfs_backref_node *node) { - struct btrfs_backref_node *upper; struct btrfs_backref_edge *edge; if (!node) return; - BUG_ON(!node->lowest && !node->detached); while (!list_empty(&node->upper)) { - edge = list_entry(node->upper.next, struct btrfs_backref_edge, - list[LOWER]); - upper = edge->node[UPPER]; + edge = list_first_entry(&node->upper, struct btrfs_backref_edge, + list[LOWER]); list_del(&edge->list[LOWER]); list_del(&edge->list[UPPER]); btrfs_backref_free_edge(cache, edge); - - /* - * Add the node to leaf node list if no other child block - * cached. - */ - if (list_empty(&upper->lower)) { - list_add_tail(&upper->lower, &cache->leaves); - upper->lowest = 1; - } } btrfs_backref_drop_node(cache, node); @@ -3166,49 +3150,25 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, void btrfs_backref_release_cache(struct btrfs_backref_cache *cache) { struct btrfs_backref_node *node; - int i; - - while (!list_empty(&cache->detached)) { - node = list_entry(cache->detached.next, - struct btrfs_backref_node, list); - btrfs_backref_cleanup_node(cache, node); - } - while (!list_empty(&cache->leaves)) { - node = list_entry(cache->leaves.next, - struct btrfs_backref_node, lower); + while ((node = rb_entry_safe(rb_first(&cache->rb_root), + struct btrfs_backref_node, rb_node))) btrfs_backref_cleanup_node(cache, node); - } - for (i = 0; i < BTRFS_MAX_LEVEL; i++) { - while (!list_empty(&cache->pending[i])) { - node = list_first_entry(&cache->pending[i], - struct btrfs_backref_node, - list); - btrfs_backref_cleanup_node(cache, node); - } - } ASSERT(list_empty(&cache->pending_edge)); ASSERT(list_empty(&cache->useless_node)); - ASSERT(list_empty(&cache->changed)); - ASSERT(list_empty(&cache->detached)); - ASSERT(RB_EMPTY_ROOT(&cache->rb_root)); ASSERT(!cache->nr_nodes); ASSERT(!cache->nr_edges); } -void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, - struct btrfs_backref_node *lower, - struct btrfs_backref_node *upper, - int link_which) +static void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, + struct btrfs_backref_node *lower, + struct btrfs_backref_node *upper) { ASSERT(upper && lower && upper->level == lower->level + 1); edge->node[LOWER] = lower; edge->node[UPPER] = upper; - if (link_which & LINK_LOWER) - list_add_tail(&edge->list[LOWER], &lower->upper); - if (link_which & LINK_UPPER) - list_add_tail(&edge->list[UPPER], &upper->lower); + list_add_tail(&edge->list[LOWER], &lower->upper); } /* * Handle direct tree backref @@ -3278,7 +3238,7 @@ static int handle_direct_tree_backref(struct btrfs_backref_cache *cache, ASSERT(upper->checked); INIT_LIST_HEAD(&edge->list[UPPER]); } - btrfs_backref_link_edge(edge, cur, upper, LINK_LOWER); + btrfs_backref_link_edge(edge, cur, upper); return 0; } @@ -3316,8 +3276,12 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, root = btrfs_get_fs_root(fs_info, ref_key->offset, false); if (IS_ERR(root)) return PTR_ERR(root); - if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - cur->cowonly = 1; + + /* We shouldn't be using backref cache for non-shareable roots. */ + if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) { + btrfs_put_root(root); + return -EUCLEAN; + } if (btrfs_root_level(&root->root_item) == cur->level) { /* Tree root */ @@ -3403,8 +3367,15 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, goto out; } upper->owner = btrfs_header_owner(eb); - if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - upper->cowonly = 1; + + /* We shouldn't be using backref cache for non shareable roots. */ + if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) { + btrfs_put_root(root); + btrfs_backref_free_edge(cache, edge); + btrfs_backref_free_node(cache, upper); + ret = -EUCLEAN; + goto out; + } /* * If we know the block isn't shared we can avoid @@ -3437,7 +3408,7 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, if (!upper->owner) upper->owner = btrfs_header_owner(eb); } - btrfs_backref_link_edge(edge, lower, upper, LINK_LOWER); + btrfs_backref_link_edge(edge, lower, upper); if (rb_node) { btrfs_put_root(root); @@ -3498,8 +3469,8 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans, * type BTRFS_TREE_BLOCK_REF_KEY */ ASSERT(list_is_singular(&cur->upper)); - edge = list_entry(cur->upper.next, struct btrfs_backref_edge, - list[LOWER]); + edge = list_first_entry(&cur->upper, struct btrfs_backref_edge, + list[LOWER]); ASSERT(list_empty(&edge->list[UPPER])); exist = edge->node[UPPER]; /* @@ -3595,15 +3566,9 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, ASSERT(start->checked); - /* Insert this node to cache if it's not COW-only */ - if (!start->cowonly) { - rb_node = rb_simple_insert(&cache->rb_root, start->bytenr, - &start->rb_node); - if (rb_node) - btrfs_backref_panic(cache->fs_info, start->bytenr, - -EEXIST); - list_add_tail(&start->lower, &cache->leaves); - } + rb_node = rb_simple_insert(&cache->rb_root, &start->simple_node); + if (rb_node) + btrfs_backref_panic(cache->fs_info, start->bytenr, -EEXIST); /* * Use breadth first search to iterate all related edges. @@ -3642,38 +3607,22 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, * parents have already been linked. */ if (!RB_EMPTY_NODE(&upper->rb_node)) { - if (upper->lowest) { - list_del_init(&upper->lower); - upper->lowest = 0; - } - list_add_tail(&edge->list[UPPER], &upper->lower); continue; } /* Sanity check, we shouldn't have any unchecked nodes */ if (!upper->checked) { - ASSERT(0); + DEBUG_WARN("we should not have any unchecked nodes"); return -EUCLEAN; } - /* Sanity check, COW-only node has non-COW-only parent */ - if (start->cowonly != upper->cowonly) { - ASSERT(0); + rb_node = rb_simple_insert(&cache->rb_root, &upper->simple_node); + if (unlikely(rb_node)) { + btrfs_backref_panic(cache->fs_info, upper->bytenr, -EEXIST); return -EUCLEAN; } - /* Only cache non-COW-only (subvolume trees) tree blocks */ - if (!upper->cowonly) { - rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr, - &upper->rb_node); - if (rb_node) { - btrfs_backref_panic(cache->fs_info, - upper->bytenr, -EEXIST); - return -EUCLEAN; - } - } - list_add_tail(&edge->list[UPPER], &upper->lower); /* diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index e8c22cccb5c1..34b0193a181c 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -226,8 +226,7 @@ int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, iterate_extent_inodes_t *iterate, void *user_ctx); int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, - struct btrfs_path *path, void *ctx, - bool ignore_offset); + void *ctx, bool ignore_offset); int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); @@ -313,11 +312,22 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter); * Represent a tree block in the backref cache */ struct btrfs_backref_node { - struct { - struct rb_node rb_node; - u64 bytenr; - }; /* Use rb_simple_node for search/insert */ + union{ + /* Use rb_simple_node for search/insert */ + struct { + struct rb_node rb_node; + u64 bytenr; + }; + + struct rb_simple_node simple_node; + }; + /* + * This is a sanity check, whenever we COW a block we will update + * new_bytenr with it's current location, and we will check this in + * various places to validate that the cache makes sense, it shouldn't + * be used for anything else. + */ u64 new_bytenr; /* Objectid of tree block owner, can be not uptodate */ u64 owner; @@ -335,10 +345,6 @@ struct btrfs_backref_node { struct extent_buffer *eb; /* Level of the tree block */ unsigned int level:8; - /* Is the block in a non-shareable tree */ - unsigned int cowonly:1; - /* 1 if no child node is in the cache */ - unsigned int lowest:1; /* Is the extent buffer locked */ unsigned int locked:1; /* Has the block been processed */ @@ -391,12 +397,6 @@ struct btrfs_backref_cache { * level blocks may not reflect the new location */ struct list_head pending[BTRFS_MAX_LEVEL]; - /* List of backref nodes with no child node */ - struct list_head leaves; - /* List of blocks that have been COWed in current transaction */ - struct list_head changed; - /* List of detached backref node. */ - struct list_head detached; u64 last_trans; @@ -427,13 +427,6 @@ struct btrfs_backref_node *btrfs_backref_alloc_node( struct btrfs_backref_edge *btrfs_backref_alloc_edge( struct btrfs_backref_cache *cache); -#define LINK_LOWER (1 << 0) -#define LINK_UPPER (1 << 1) - -void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, - struct btrfs_backref_node *lower, - struct btrfs_backref_node *upper, - int link_which); void btrfs_backref_free_node(struct btrfs_backref_cache *cache, struct btrfs_backref_node *node); void btrfs_backref_free_edge(struct btrfs_backref_cache *cache, diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 1f216d07eff6..50b5fc1c06d7 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -27,12 +27,12 @@ struct btrfs_failed_bio { }; /* Is this a data path I/O that needs storage layer checksum and repair? */ -static inline bool is_data_bbio(struct btrfs_bio *bbio) +static inline bool is_data_bbio(const struct btrfs_bio *bbio) { return bbio->inode && is_data_inode(bbio->inode); } -static bool bbio_has_ordered_extent(struct btrfs_bio *bbio) +static bool bbio_has_ordered_extent(const struct btrfs_bio *bbio) { return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE; } @@ -81,6 +81,9 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS, &btrfs_clone_bioset); + if (IS_ERR(bio)) + return ERR_CAST(bio); + bbio = btrfs_bio(bio); btrfs_bio_init(bbio, fs_info, NULL, orig_bbio); bbio->inode = orig_bbio->inode; @@ -94,33 +97,17 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, return bbio; } -/* Free a bio that was never submitted to the underlying device. */ -static void btrfs_cleanup_bio(struct btrfs_bio *bbio) -{ - if (bbio_has_ordered_extent(bbio)) - btrfs_put_ordered_extent(bbio->ordered); - bio_put(&bbio->bio); -} - -static void __btrfs_bio_end_io(struct btrfs_bio *bbio) -{ - if (bbio_has_ordered_extent(bbio)) { - struct btrfs_ordered_extent *ordered = bbio->ordered; - - bbio->end_io(bbio); - btrfs_put_ordered_extent(ordered); - } else { - bbio->end_io(bbio); - } -} - void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { bbio->bio.bi_status = status; if (bbio->bio.bi_pool == &btrfs_clone_bioset) { struct btrfs_bio *orig_bbio = bbio->private; - btrfs_cleanup_bio(bbio); + /* Free bio that was never submitted to the underlying device. */ + if (bbio_has_ordered_extent(bbio)) + btrfs_put_ordered_extent(bbio->ordered); + bio_put(&bbio->bio); + bbio = orig_bbio; } @@ -135,18 +122,26 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) /* Load split bio's error which might be set above. */ if (status == BLK_STS_OK) bbio->bio.bi_status = READ_ONCE(bbio->status); - __btrfs_bio_end_io(bbio); + + if (bbio_has_ordered_extent(bbio)) { + struct btrfs_ordered_extent *ordered = bbio->ordered; + + bbio->end_io(bbio); + btrfs_put_ordered_extent(ordered); + } else { + bbio->end_io(bbio); + } } } -static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) +static int next_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror) { if (cur_mirror == fbio->num_copies) return cur_mirror + 1 - fbio->num_copies; return cur_mirror + 1; } -static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) +static int prev_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror) { if (cur_mirror == 1) return fbio->num_copies; @@ -170,12 +165,6 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio); int mirror = repair_bbio->mirror_num; - /* - * We can only trigger this for data bio, which doesn't support larger - * folios yet. - */ - ASSERT(folio_order(page_folio(bv->bv_page)) == 0); - if (repair_bbio->bio.bi_status || !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) { bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); @@ -197,7 +186,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, btrfs_repair_io_failure(fs_info, btrfs_ino(inode), repair_bbio->file_offset, fs_info->sectorsize, repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT, - page_folio(bv->bv_page), bv->bv_offset, mirror); + bvec_phys(bv), mirror); } while (mirror != fbio->bbio->mirror_num); done: @@ -306,7 +295,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de btrfs_bio_end_io(bbio, bbio->bio.bi_status); } -static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) +static void btrfs_log_dev_io_error(const struct bio *bio, struct btrfs_device *dev) { if (!dev || !dev->bdev) return; @@ -321,8 +310,8 @@ static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); } -static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info, - struct bio *bio) +static struct workqueue_struct *btrfs_end_io_wq(const struct btrfs_fs_info *fs_info, + const struct bio *bio) { if (bio->bi_opf & REQ_META) return fs_info->endio_meta_workers; @@ -355,7 +344,7 @@ static void btrfs_simple_end_io(struct bio *bio) INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); } else { - if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) + if (bio_is_zone_append(bio) && !bio->bi_status) btrfs_record_physical_zoned(bbio); btrfs_bio_end_io(bbio, bbio->bio.bi_status); } @@ -398,7 +387,7 @@ static void btrfs_orig_write_end_io(struct bio *bio) else bio->bi_status = BLK_STS_OK; - if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) + if (bio_is_zone_append(bio) && !bio->bi_status) stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; btrfs_bio_end_io(bbio, bbio->bio.bi_status); @@ -412,7 +401,7 @@ static void btrfs_clone_write_end_io(struct bio *bio) if (bio->bi_status) { atomic_inc(&stripe->bioc->error); btrfs_log_dev_io_error(bio, stripe->dev); - } else if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + } else if (bio_is_zone_append(bio)) { stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; } @@ -444,12 +433,20 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) ASSERT(btrfs_dev_is_sequential(dev, physical)); bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; } - btrfs_debug_in_rcu(dev->fs_info, + btrfs_debug(dev->fs_info, "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), dev->devid, bio->bi_iter.bi_size); + /* + * Track reads if tracking is enabled; ignore I/O operations before the + * filesystem is fully initialized. + */ + if (dev->fs_devices->collect_fs_stats && bio_op(bio) == REQ_OP_READ && dev->fs_info) + percpu_counter_add(&dev->fs_info->stats_read_blocks, + bio->bi_iter.bi_size >> dev->fs_info->sectorsize_bits); + if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT) blkcg_punt_bio_submit(bio); else @@ -509,7 +506,7 @@ static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, } } -static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio) +static int btrfs_bio_csum(struct btrfs_bio *bbio) { if (bbio->bio.bi_opf & REQ_META) return btree_csum_one_bio(bbio); @@ -540,11 +537,11 @@ static void run_one_async_start(struct btrfs_work *work) { struct async_submit_bio *async = container_of(work, struct async_submit_bio, work); - blk_status_t ret; + int ret; ret = btrfs_bio_csum(async->bbio); if (ret) - async->bbio->bio.bi_status = ret; + async->bbio->bio.bi_status = errno_to_blk_status(ret); } /* @@ -570,7 +567,7 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free) /* If an error occurred we just want to clean up the bio and move on. */ if (bio->bi_status) { - btrfs_bio_end_io(async->bbio, async->bbio->bio.bi_status); + btrfs_bio_end_io(async->bbio, bio->bi_status); return; } @@ -649,8 +646,14 @@ static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length) map_length = min(map_length, bbio->fs_info->max_zone_append_size); sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits, &nr_segs, map_length); - if (sector_offset) - return sector_offset << SECTOR_SHIFT; + if (sector_offset) { + /* + * bio_split_rw_at() could split at a size smaller than our + * sectorsize and thus cause unaligned I/Os. Fix that by + * always rounding down to the nearest boundary. + */ + return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, bbio->fs_info->sectorsize); + } return map_length; } @@ -665,8 +668,8 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) bool use_append = btrfs_use_zone_append(bbio); struct btrfs_io_context *bioc = NULL; struct btrfs_io_stripe smap; - blk_status_t ret; - int error; + blk_status_t status; + int ret; if (!bbio->inode || btrfs_is_data_reloc_root(inode->root)) smap.rst_search_commit_root = true; @@ -674,11 +677,12 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) smap.rst_search_commit_root = false; btrfs_bio_counter_inc_blocked(fs_info); - error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - &bioc, &smap, &mirror_num); - if (error) { - ret = errno_to_blk_status(error); - goto fail; + ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, + &bioc, &smap, &mirror_num); + if (ret) { + status = errno_to_blk_status(ret); + btrfs_bio_counter_dec(fs_info); + goto end_bbio; } map_length = min(map_length, length); @@ -686,7 +690,15 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) map_length = btrfs_append_map_length(bbio, map_length); if (map_length < length) { - bbio = btrfs_split_bio(fs_info, bbio, map_length); + struct btrfs_bio *split; + + split = btrfs_split_bio(fs_info, bbio, map_length); + if (IS_ERR(split)) { + status = errno_to_blk_status(PTR_ERR(split)); + btrfs_bio_counter_dec(fs_info); + goto end_bbio; + } + bbio = split; bio = &bbio->bio; } @@ -697,7 +709,8 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) { bbio->saved_iter = bio->bi_iter; ret = btrfs_lookup_bio_sums(bbio); - if (ret) + status = errno_to_blk_status(ret); + if (status) goto fail; } @@ -707,8 +720,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) bio->bi_opf |= REQ_OP_ZONE_APPEND; } - if (is_data_bbio(bbio) && bioc && - btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) { + if (is_data_bbio(bbio) && bioc && bioc->use_rst) { /* * No locking for the list update, as we only add to * the list in the I/O submission path, and list @@ -731,13 +743,15 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) goto done; ret = btrfs_bio_csum(bbio); - if (ret) + status = errno_to_blk_status(ret); + if (status) goto fail; } else if (use_append || (btrfs_is_zoned(fs_info) && inode && inode->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_alloc_dummy_sum(bbio); - if (ret) + status = errno_to_blk_status(ret); + if (status) goto fail; } } @@ -758,9 +772,10 @@ fail: ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset); ASSERT(remaining); - btrfs_bio_end_io(remaining, ret); + btrfs_bio_end_io(remaining, status); } - btrfs_bio_end_io(bbio, ret); +end_bbio: + btrfs_bio_end_io(bbio, status); /* Do not submit another chunk */ return true; } @@ -785,8 +800,7 @@ void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num) * freeing the bio. */ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct folio *folio, - unsigned int folio_offset, int mirror_num) + u64 length, u64 logical, phys_addr_t paddr, int mirror_num) { struct btrfs_io_stripe smap = { 0 }; struct bio_vec bvec; @@ -817,8 +831,7 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; - ret = bio_add_folio(&bio, folio, length, folio_offset); - ASSERT(ret); + __bio_add_page(&bio, phys_to_page(paddr), length, offset_in_page(paddr)); ret = submit_bio_wait(&bio); if (ret) { /* try to remap that extent elsewhere? */ @@ -826,7 +839,7 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, goto out_bio_uninit; } - btrfs_info_rl_in_rcu(fs_info, + btrfs_info_rl(fs_info, "read error corrected: ino %llu off %llu (dev %s sector %llu)", ino, start, btrfs_dev_name(smap.dev), smap.physical >> SECTOR_SHIFT); @@ -882,22 +895,18 @@ int __init btrfs_bioset_init(void) return -ENOMEM; if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE, offsetof(struct btrfs_bio, bio), 0)) - goto out_free_bioset; + goto out; if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE, offsetof(struct btrfs_bio, bio), BIOSET_NEED_BVECS)) - goto out_free_clone_bioset; + goto out; if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE, sizeof(struct btrfs_failed_bio))) - goto out_free_repair_bioset; + goto out; return 0; -out_free_repair_bioset: - bioset_exit(&btrfs_repair_bioset); -out_free_clone_bioset: - bioset_exit(&btrfs_clone_bioset); -out_free_bioset: - bioset_exit(&btrfs_bioset); +out: + btrfs_bioset_exit(); return -ENOMEM; } diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index e2fe16074ad6..dc2eb43b7097 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -110,7 +110,6 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status); void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num); void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace); int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct folio *folio, - unsigned int folio_offset, int mirror_num); + u64 length, u64 logical, phys_addr_t paddr, int mirror_num); #endif diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 4427c1b835e8..9bf282d2453c 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -34,6 +34,19 @@ int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group } #endif +static inline bool has_unwritten_metadata(struct btrfs_block_group *block_group) +{ + /* The meta_write_pointer is available only on the zoned setup. */ + if (!btrfs_is_zoned(block_group->fs_info)) + return false; + + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) + return false; + + return block_group->start + block_group->alloc_offset > + block_group->meta_write_pointer; +} + /* * Return target flags in extended format or 0 if restripe for this chunk_type * is not in progress @@ -173,43 +186,41 @@ void btrfs_put_block_group(struct btrfs_block_group *cache) } } +static int btrfs_bg_start_cmp(const struct rb_node *new, + const struct rb_node *exist) +{ + const struct btrfs_block_group *new_bg = + rb_entry(new, struct btrfs_block_group, cache_node); + const struct btrfs_block_group *exist_bg = + rb_entry(exist, struct btrfs_block_group, cache_node); + + if (new_bg->start < exist_bg->start) + return -1; + if (new_bg->start > exist_bg->start) + return 1; + return 0; +} + /* * This adds the block group to the fs_info rb tree for the block group cache */ -static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, - struct btrfs_block_group *block_group) +static int btrfs_add_block_group_cache(struct btrfs_block_group *block_group) { - struct rb_node **p; - struct rb_node *parent = NULL; - struct btrfs_block_group *cache; - bool leftmost = true; + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct rb_node *exist; + int ret = 0; ASSERT(block_group->length != 0); - write_lock(&info->block_group_cache_lock); - p = &info->block_group_cache_tree.rb_root.rb_node; - - while (*p) { - parent = *p; - cache = rb_entry(parent, struct btrfs_block_group, cache_node); - if (block_group->start < cache->start) { - p = &(*p)->rb_left; - } else if (block_group->start > cache->start) { - p = &(*p)->rb_right; - leftmost = false; - } else { - write_unlock(&info->block_group_cache_lock); - return -EEXIST; - } - } - - rb_link_node(&block_group->cache_node, parent, p); - rb_insert_color_cached(&block_group->cache_node, - &info->block_group_cache_tree, leftmost); + write_lock(&fs_info->block_group_cache_lock); - write_unlock(&info->block_group_cache_lock); + exist = rb_find_add_cached(&block_group->cache_node, + &fs_info->block_group_cache_tree, btrfs_bg_start_cmp); + if (exist) + ret = -EEXIST; + write_unlock(&fs_info->block_group_cache_lock); - return 0; + return ret; } /* @@ -527,10 +538,9 @@ int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start, *total_added_ret = 0; while (start < end) { - if (!find_first_extent_bit(&info->excluded_extents, start, - &extent_start, &extent_end, - EXTENT_DIRTY | EXTENT_UPTODATE, - NULL)) + if (!btrfs_find_first_extent_bit(&info->excluded_extents, start, + &extent_start, &extent_end, + EXTENT_DIRTY, NULL)) break; if (extent_start <= start) { @@ -586,7 +596,7 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ struct btrfs_root *extent_root; u64 search_offset; u64 search_end = block_group->start + block_group->length; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key search_key; int ret = 0; @@ -628,7 +638,6 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ lockdep_assert_held(&caching_ctl->mutex); lockdep_assert_held_read(&fs_info->commit_root_sem); - btrfs_free_path(path); return ret; } @@ -704,7 +713,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) struct btrfs_block_group *block_group = caching_ctl->block_group; struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *extent_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key key; u64 total_found = 0; @@ -740,8 +749,8 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) path->reada = READA_FORWARD; key.objectid = last; - key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = 0; next: ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); @@ -787,8 +796,8 @@ next: if (key.objectid < last) { key.objectid = last; - key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = 0; btrfs_release_path(path); goto next; } @@ -831,14 +840,13 @@ next: block_group->start + block_group->length, NULL); out: - btrfs_free_path(path); return ret; } static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg) { - clear_extent_bits(&bg->fs_info->excluded_extents, bg->start, - bg->start + bg->length - 1, EXTENT_UPTODATE); + btrfs_clear_extent_bit(&bg->fs_info->excluded_extents, bg->start, + bg->start + bg->length - 1, EXTENT_DIRTY, NULL); } static noinline void caching_thread(struct btrfs_work *work) @@ -882,7 +890,7 @@ static noinline void caching_thread(struct btrfs_work *work) */ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags))) - ret = load_free_space_tree(caching_ctl); + ret = btrfs_load_free_space_tree(caching_ctl); else ret = load_extent_tree_free(caching_ctl); done: @@ -1223,7 +1231,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, block_group->space_info->total_bytes -= block_group->length; block_group->space_info->bytes_readonly -= (block_group->length - block_group->zone_unusable); - btrfs_space_info_update_bytes_zone_unusable(fs_info, block_group->space_info, + btrfs_space_info_update_bytes_zone_unusable(block_group->space_info, -block_group->zone_unusable); block_group->space_info->disk_total -= block_group->length * factor; @@ -1240,7 +1248,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * another task to attempt to create another block group with the same * item key (and failing with -EEXIST and a transaction abort). */ - ret = remove_block_group_free_space(trans, block_group); + ret = btrfs_remove_block_group_free_space(trans, block_group); if (ret) goto out; @@ -1249,6 +1257,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, goto out; spin_lock(&block_group->lock); + /* + * Hitting this WARN means we removed a block group with an unwritten + * region. It will cause "unable to find chunk map for logical" errors. + */ + if (WARN_ON(has_unwritten_metadata(block_group))) + btrfs_warn(fs_info, + "block group %llu is removed before metadata write out", + block_group->start); + set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags); /* @@ -1396,8 +1413,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) if (btrfs_is_zoned(cache->fs_info)) { /* Migrate zone_unusable bytes to readonly */ sinfo->bytes_readonly += cache->zone_unusable; - btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo, - -cache->zone_unusable); + btrfs_space_info_update_bytes_zone_unusable(sinfo, -cache->zone_unusable); cache->zone_unusable = 0; } cache->ro++; @@ -1409,7 +1425,7 @@ out: if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { btrfs_info(cache->fs_info, "unable to make block group %llu ro", cache->start); - btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0); + btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, false); } return ret; } @@ -1424,9 +1440,8 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans, int ret; spin_lock(&fs_info->trans_lock); - if (trans->transaction->list.prev != &fs_info->trans_list) { - prev_trans = list_last_entry(&trans->transaction->list, - struct btrfs_transaction, list); + if (!list_is_first(&trans->transaction->list, &fs_info->trans_list)) { + prev_trans = list_prev_entry(trans->transaction, list); refcount_inc(&prev_trans->use_count); } spin_unlock(&fs_info->trans_lock); @@ -1443,14 +1458,14 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans, */ mutex_lock(&fs_info->unused_bg_unpin_mutex); if (prev_trans) { - ret = clear_extent_bits(&prev_trans->pinned_extents, start, end, - EXTENT_DIRTY); + ret = btrfs_clear_extent_bit(&prev_trans->pinned_extents, start, end, + EXTENT_DIRTY, NULL); if (ret) goto out; } - ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end, - EXTENT_DIRTY); + ret = btrfs_clear_extent_bit(&trans->transaction->pinned_extents, start, end, + EXTENT_DIRTY, NULL); out: mutex_unlock(&fs_info->unused_bg_unpin_mutex); if (prev_trans) @@ -1460,6 +1475,32 @@ out: } /* + * Link the block_group to a list via bg_list. + * + * @bg: The block_group to link to the list. + * @list: The list to link it to. + * + * Use this rather than list_add_tail() directly to ensure proper respect + * to locking and refcounting. + * + * Returns: true if the bg was linked with a refcount bump and false otherwise. + */ +static bool btrfs_link_bg_list(struct btrfs_block_group *bg, struct list_head *list) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + bool added = false; + + spin_lock(&fs_info->unused_bgs_lock); + if (list_empty(&bg->bg_list)) { + btrfs_get_block_group(bg); + list_add_tail(&bg->bg_list, list); + added = true; + } + spin_unlock(&fs_info->unused_bgs_lock); + return added; +} + +/* * Process the unused_bgs list and remove any that don't have any allocated * space inside of them. */ @@ -1567,15 +1608,15 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * needing to allocate extents from the block group. */ used = btrfs_space_info_used(space_info, true); - if (space_info->total_bytes - block_group->length < used && - block_group->zone_unusable < block_group->length) { + if ((space_info->total_bytes - block_group->length < used && + block_group->zone_unusable < block_group->length) || + has_unwritten_metadata(block_group)) { /* * Add a reference for the list, compensate for the ref * drop under the "next" label for the * fs_info->unused_bgs list. */ - btrfs_get_block_group(block_group); - list_add_tail(&block_group->bg_list, &retry_list); + btrfs_link_bg_list(block_group, &retry_list); trace_btrfs_skip_unused_block_group(block_group); spin_unlock(&block_group->lock); @@ -1598,8 +1639,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) ret = btrfs_zone_finish(block_group); if (ret < 0) { btrfs_dec_block_group_ro(block_group); - if (ret == -EAGAIN) + if (ret == -EAGAIN) { + btrfs_link_bg_list(block_group, &retry_list); ret = 0; + } goto next; } @@ -1645,8 +1688,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&space_info->lock); spin_lock(&block_group->lock); - btrfs_space_info_update_bytes_pinned(fs_info, space_info, - -block_group->pinned); + btrfs_space_info_update_bytes_pinned(space_info, -block_group->pinned); space_info->bytes_readonly += block_group->pinned; block_group->pinned = 0; @@ -1826,8 +1868,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) */ list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp); while (!list_empty(&fs_info->reclaim_bgs)) { - u64 zone_unusable; - u64 reclaimed; + u64 used; + u64 reserved; int ret = 0; bg = list_first_entry(&fs_info->reclaim_bgs, @@ -1891,6 +1933,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) up_write(&space_info->groups_sem); goto next; } + spin_unlock(&bg->lock); spin_unlock(&space_info->lock); @@ -1907,31 +1950,41 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) goto next; } - /* - * Cache the zone_unusable value before turning the block group - * to read only. As soon as the blog group is read only it's - * zone_unusable value gets moved to the block group's read-only - * bytes and isn't available for calculations anymore. - */ - zone_unusable = bg->zone_unusable; ret = inc_block_group_ro(bg, 0); up_write(&space_info->groups_sem); if (ret < 0) goto next; - btrfs_info(fs_info, - "reclaiming chunk %llu with %llu%% used %llu%% unusable", - bg->start, - div64_u64(bg->used * 100, bg->length), - div64_u64(zone_unusable * 100, bg->length)); + /* + * The amount of bytes reclaimed corresponds to the sum of the + * "used" and "reserved" counters. We have set the block group + * to RO above, which prevents reservations from happening but + * we may have existing reservations for which allocation has + * not yet been done - btrfs_update_block_group() was not yet + * called, which is where we will transfer a reserved extent's + * size from the "reserved" counter to the "used" counter - this + * happens when running delayed references. When we relocate the + * chunk below, relocation first flushes dellaloc, waits for + * ordered extent completion (which is where we create delayed + * references for data extents) and commits the current + * transaction (which runs delayed references), and only after + * it does the actual work to move extents out of the block + * group. So the reported amount of reclaimed bytes is + * effectively the sum of the 'used' and 'reserved' counters. + */ + spin_lock(&bg->lock); + used = bg->used; + reserved = bg->reserved; + spin_unlock(&bg->lock); + trace_btrfs_reclaim_block_group(bg); - reclaimed = bg->used; - ret = btrfs_relocate_chunk(fs_info, bg->start); + ret = btrfs_relocate_chunk(fs_info, bg->start, false); if (ret) { btrfs_dec_block_group_ro(bg); btrfs_err(fs_info, "error relocating chunk %llu", bg->start); - reclaimed = 0; + used = 0; + reserved = 0; spin_lock(&space_info->lock); space_info->reclaim_errors++; if (READ_ONCE(space_info->periodic_reclaim)) @@ -1940,24 +1993,13 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) } spin_lock(&space_info->lock); space_info->reclaim_count++; - space_info->reclaim_bytes += reclaimed; + space_info->reclaim_bytes += used; + space_info->reclaim_bytes += reserved; spin_unlock(&space_info->lock); next: - if (ret && !READ_ONCE(space_info->periodic_reclaim)) { - /* Refcount held by the reclaim_bgs list after splice. */ - spin_lock(&fs_info->unused_bgs_lock); - /* - * This block group might be added to the unused list - * during the above process. Move it back to the - * reclaim list otherwise. - */ - if (list_empty(&bg->bg_list)) { - btrfs_get_block_group(bg); - list_add_tail(&bg->bg_list, &retry_list); - } - spin_unlock(&fs_info->unused_bgs_lock); - } + if (ret && !READ_ONCE(space_info->periodic_reclaim)) + btrfs_link_bg_list(bg, &retry_list); btrfs_put_block_group(bg); mutex_unlock(&fs_info->reclaim_bgs_lock); @@ -1997,13 +2039,8 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = bg->fs_info; - spin_lock(&fs_info->unused_bgs_lock); - if (list_empty(&bg->bg_list)) { - btrfs_get_block_group(bg); + if (btrfs_link_bg_list(bg, &fs_info->reclaim_bgs)) trace_btrfs_add_reclaim_block_group(bg); - list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs); - } - spin_unlock(&fs_info->unused_bgs_lock); } static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key *key, @@ -2186,9 +2223,9 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) if (cache->start < BTRFS_SUPER_INFO_OFFSET) { stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start; cache->bytes_super += stripe_len; - ret = set_extent_bit(&fs_info->excluded_extents, cache->start, - cache->start + stripe_len - 1, - EXTENT_UPTODATE, NULL); + ret = btrfs_set_extent_bit(&fs_info->excluded_extents, cache->start, + cache->start + stripe_len - 1, + EXTENT_DIRTY, NULL); if (ret) return ret; } @@ -2214,9 +2251,9 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) cache->start + cache->length - logical[nr]); cache->bytes_super += len; - ret = set_extent_bit(&fs_info->excluded_extents, logical[nr], - logical[nr] + len - 1, - EXTENT_UPTODATE, NULL); + ret = btrfs_set_extent_bit(&fs_info->excluded_extents, + logical[nr], logical[nr] + len - 1, + EXTENT_DIRTY, NULL); if (ret) { kfree(logical); return ret; @@ -2341,8 +2378,9 @@ static int read_one_block_group(struct btrfs_fs_info *info, cache->commit_used = cache->used; cache->flags = btrfs_stack_block_group_flags(bgi); cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); + cache->space_info = btrfs_find_space_info(info, cache->flags); - set_free_space_tree_thresholds(cache); + btrfs_set_free_space_tree_thresholds(cache); if (need_clear) { /* @@ -2414,11 +2452,12 @@ static int read_one_block_group(struct btrfs_fs_info *info, goto error; } - ret = btrfs_add_block_group_cache(info, cache); + ret = btrfs_add_block_group_cache(cache); if (ret) { btrfs_remove_free_space_cache(cache); goto error; } + trace_btrfs_add_block_group(info, cache, 0); btrfs_add_bg_to_space_info(info, cache); @@ -2463,7 +2502,8 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) bg->cached = BTRFS_CACHE_FINISHED; bg->used = map->chunk_len; bg->flags = map->type; - ret = btrfs_add_block_group_cache(fs_info, bg); + bg->space_info = btrfs_find_space_info(fs_info, bg->flags); + ret = btrfs_add_block_group_cache(bg); /* * We may have some valid block group cache added already, in * that case we skip to the next one. @@ -2513,8 +2553,8 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) return fill_dummy_bgs(info); key.objectid = 0; - key.offset = 0; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + key.offset = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2645,7 +2685,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dev_extent *extent; struct extent_buffer *leaf; struct btrfs_key key; @@ -2662,7 +2702,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans, key.offset = start; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent)); if (ret) - goto out; + return ret; leaf = path->nodes[0]; extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); @@ -2670,11 +2710,8 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans, btrfs_set_dev_extent_chunk_objectid(leaf, extent, BTRFS_FIRST_CHUNK_TREE_OBJECTID); btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); - btrfs_set_dev_extent_length(leaf, extent, num_bytes); - btrfs_mark_buffer_dirty(trans, leaf); -out: - btrfs_free_path(path); + return ret; } @@ -2762,7 +2799,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) block_group->length); if (ret) btrfs_abort_transaction(trans, ret); - add_block_group_free_space(trans, block_group); + btrfs_add_block_group_free_space(trans, block_group); /* * If we restriped during balance, we may have added a new raid @@ -2776,8 +2813,12 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) /* Already aborted the transaction if it failed. */ next: btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); + + spin_lock(&fs_info->unused_bgs_lock); list_del_init(&block_group->bg_list); clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags); + btrfs_put_block_group(block_group); + spin_unlock(&fs_info->unused_bgs_lock); /* * If the block group is still unused, add it to the list of @@ -2835,8 +2876,8 @@ static u64 calculate_global_root_id(const struct btrfs_fs_info *fs_info, u64 off } struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, - u64 type, - u64 chunk_offset, u64 size) + struct btrfs_space_info *space_info, + u64 type, u64 chunk_offset, u64 size) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *cache; @@ -2856,7 +2897,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags); cache->length = size; - set_free_space_tree_thresholds(cache); + btrfs_set_free_space_tree_thresholds(cache); cache->flags = type; cache->cached = BTRFS_CACHE_FINISHED; cache->global_root_id = calculate_global_root_id(fs_info, cache->start); @@ -2890,10 +2931,10 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran * assigned to our block group. We want our bg to be added to the rbtree * with its ->space_info set. */ - cache->space_info = btrfs_find_space_info(fs_info, cache->flags); + cache->space_info = space_info; ASSERT(cache->space_info); - ret = btrfs_add_block_group_cache(fs_info, cache); + ret = btrfs_add_block_group_cache(cache); if (ret) { btrfs_remove_free_space_cache(cache); btrfs_put_block_group(cache); @@ -2915,7 +2956,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran } #endif - list_add_tail(&cache->bg_list, &trans->new_bgs); + btrfs_link_bg_list(cache, &trans->new_bgs); btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info); set_avail_alloc_bits(fs_info, type); @@ -2935,6 +2976,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, bool do_chunk_alloc) { struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_space_info *space_info = cache->space_info; struct btrfs_trans_handle *trans; struct btrfs_root *root = btrfs_block_group_root(fs_info); u64 alloc_flags; @@ -2987,7 +3029,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, */ alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); if (alloc_flags != cache->flags) { - ret = btrfs_chunk_alloc(trans, alloc_flags, + ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE); /* * ENOSPC is allowed here, we may have enough space @@ -3015,15 +3057,15 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM)) goto unlock_out; - alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags); - ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); + alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags); + ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE); if (ret < 0) goto out; /* * We have allocated a new chunk. We also need to activate that chunk to * grant metadata tickets for zoned filesystem. */ - ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true); + ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true); if (ret < 0) goto out; @@ -3060,8 +3102,7 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) (cache->alloc_offset - cache->used - cache->pinned - cache->reserved) + (cache->length - cache->zone_capacity); - btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo, - cache->zone_unusable); + btrfs_space_info_update_bytes_zone_unusable(sinfo, cache->zone_unusable); sinfo->bytes_readonly -= cache->zone_unusable; } num_bytes = cache->length - cache->reserved - @@ -3123,7 +3164,6 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, cache->global_root_id); btrfs_set_stack_block_group_flags(&bgi, cache->flags); write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); - btrfs_mark_buffer_dirty(trans, leaf); fail: btrfs_release_path(path); /* @@ -3313,7 +3353,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *cache, *tmp; struct btrfs_transaction *cur_trans = trans->transaction; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); if (list_empty(&cur_trans->dirty_bgs) || !btrfs_test_opt(fs_info, SPACE_CACHE)) @@ -3330,7 +3370,6 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) cache_save_setup(cache, trans, path); } - btrfs_free_path(path); return 0; } @@ -3353,7 +3392,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) struct btrfs_transaction *cur_trans = trans->transaction; int ret = 0; int should_put; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); LIST_HEAD(dirty); struct list_head *io = &cur_trans->io_bgs; int loops = 0; @@ -3508,7 +3547,6 @@ out: btrfs_cleanup_dirty_bgs(cur_trans, fs_info); } - btrfs_free_path(path); return ret; } @@ -3519,7 +3557,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) struct btrfs_transaction *cur_trans = trans->transaction; int ret = 0; int should_put; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct list_head *io = &cur_trans->io_bgs; path = btrfs_alloc_path(); @@ -3606,9 +3644,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); ret = update_block_group_item(trans, path, cache); - } - if (ret) + if (ret) + btrfs_abort_transaction(trans, ret); + } else if (ret) { btrfs_abort_transaction(trans, ret); + } } /* If its not on the io list, we need to put the block group */ @@ -3631,7 +3671,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) btrfs_put_block_group(cache); } - btrfs_free_path(path); return ret; } @@ -3699,7 +3738,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, old_val -= num_bytes; cache->used = old_val; cache->pinned += num_bytes; - btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes); + btrfs_space_info_update_bytes_pinned(space_info, num_bytes); space_info->bytes_used -= num_bytes; space_info->disk_used -= num_bytes * factor; if (READ_ONCE(space_info->periodic_reclaim)) @@ -3710,8 +3749,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&space_info->lock); - set_extent_bit(&trans->transaction->pinned_extents, bytenr, - bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); + btrfs_set_extent_bit(&trans->transaction->pinned_extents, bytenr, + bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); } spin_lock(&trans->transaction->dirty_bgs_lock); @@ -3781,8 +3820,7 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, space_info->bytes_reserved += num_bytes; trace_btrfs_space_reservation(cache->fs_info, "space_info", space_info->flags, num_bytes, 1); - btrfs_space_info_update_bytes_may_use(cache->fs_info, - space_info, -ram_bytes); + btrfs_space_info_update_bytes_may_use(space_info, -ram_bytes); if (delalloc) cache->delalloc_bytes += num_bytes; @@ -3801,17 +3839,17 @@ out: /* * Update the block_group and space info counters. * - * @cache: The cache we are manipulating - * @num_bytes: The number of bytes in question - * @delalloc: The blocks are allocated for the delalloc write + * @cache: The cache we are manipulating. + * @num_bytes: The number of bytes in question. + * @is_delalloc: Whether the blocks are allocated for a delalloc write. * * This is called by somebody who is freeing space that was never actually used * on disk. For example if you reserve some space for a new leaf in transaction * A and before transaction A commits you free that leaf, you call this with * reserve set to 0 in order to clear the reservation. */ -void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, - u64 num_bytes, int delalloc) +void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, + bool is_delalloc) { struct btrfs_space_info *space_info = cache->space_info; @@ -3825,7 +3863,7 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, space_info->bytes_reserved -= num_bytes; space_info->max_extent_size = 0; - if (delalloc) + if (is_delalloc) cache->delalloc_bytes -= num_bytes; spin_unlock(&cache->lock); @@ -3844,14 +3882,14 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) } } -static int should_alloc_chunk(const struct btrfs_fs_info *fs_info, - const struct btrfs_space_info *sinfo, int force) +static bool should_alloc_chunk(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *sinfo, int force) { u64 bytes_used = btrfs_space_info_used(sinfo, false); u64 thresh; if (force == CHUNK_ALLOC_FORCE) - return 1; + return true; /* * in limited mode, we want to have some free space up to @@ -3862,22 +3900,31 @@ static int should_alloc_chunk(const struct btrfs_fs_info *fs_info, thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1)); if (sinfo->total_bytes - bytes_used < thresh) - return 1; + return true; } if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80)) - return 0; - return 1; + return false; + return true; } int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) { u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type); + struct btrfs_space_info *space_info; - return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); + space_info = btrfs_find_space_info(trans->fs_info, type); + if (!space_info) { + DEBUG_WARN(); + return -EINVAL; + } + + return btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE); } -static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) +static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_space_info *space_info, + u64 flags) { struct btrfs_block_group *bg; int ret; @@ -3890,7 +3937,7 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans */ check_system_chunk(trans, flags); - bg = btrfs_create_chunk(trans, flags); + bg = btrfs_create_chunk(trans, space_info, flags); if (IS_ERR(bg)) { ret = PTR_ERR(bg); goto out; @@ -3938,8 +3985,16 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans if (ret == -ENOSPC) { const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info); struct btrfs_block_group *sys_bg; + struct btrfs_space_info *sys_space_info; - sys_bg = btrfs_create_chunk(trans, sys_flags); + sys_space_info = btrfs_find_space_info(trans->fs_info, sys_flags); + if (!sys_space_info) { + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } + + sys_bg = btrfs_create_chunk(trans, sys_space_info, sys_flags); if (IS_ERR(sys_bg)) { ret = PTR_ERR(sys_bg); btrfs_abort_transaction(trans, ret); @@ -4070,6 +4125,8 @@ out: * * This function, btrfs_chunk_alloc(), belongs to phase 1. * + * @space_info: specify which space_info the new chunk should belong to. + * * If @force is CHUNK_ALLOC_FORCE: * - return 1 if it successfully allocates a chunk, * - return errors including -ENOSPC otherwise. @@ -4078,11 +4135,11 @@ out: * - return 1 if it successfully allocates a chunk, * - return errors including -ENOSPC otherwise. */ -int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, +int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_space_info *space_info, u64 flags, enum btrfs_chunk_alloc_enum force) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_space_info *space_info; struct btrfs_block_group *ret_bg; bool wait_for_alloc = false; bool should_alloc = false; @@ -4121,9 +4178,6 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, if (flags & BTRFS_BLOCK_GROUP_SYSTEM) return -ENOSPC; - space_info = btrfs_find_space_info(fs_info, flags); - ASSERT(space_info); - do { spin_lock(&space_info->lock); if (force < space_info->force_alloc) @@ -4184,7 +4238,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, force_metadata_allocation(fs_info); } - ret_bg = do_chunk_alloc(trans, flags); + ret_bg = do_chunk_alloc(trans, space_info, flags); trans->allocating_chunk = false; if (IS_ERR(ret_bg)) { @@ -4254,12 +4308,16 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans, if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", left, bytes, type); - btrfs_dump_space_info(fs_info, info, 0, 0); + btrfs_dump_space_info(fs_info, info, 0, false); } if (left < bytes) { u64 flags = btrfs_system_alloc_profile(fs_info); struct btrfs_block_group *bg; + struct btrfs_space_info *space_info; + + space_info = btrfs_find_space_info(fs_info, flags); + ASSERT(space_info); /* * Ignore failure to create system chunk. We might end up not @@ -4267,7 +4325,7 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans, * the paths we visit in the chunk tree (they were already COWed * or created in the current transaction for example). */ - bg = btrfs_create_chunk(trans, flags); + bg = btrfs_create_chunk(trans, space_info, flags); if (IS_ERR(bg)) { ret = PTR_ERR(bg); } else { @@ -4375,6 +4433,43 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info) } } +static void check_removing_space_info(struct btrfs_space_info *space_info) +{ + struct btrfs_fs_info *info = space_info->fs_info; + + if (space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY) { + /* This is a top space_info, proceed with its children first. */ + for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) { + if (space_info->sub_group[i]) { + check_removing_space_info(space_info->sub_group[i]); + kfree(space_info->sub_group[i]); + space_info->sub_group[i] = NULL; + } + } + } + + /* + * Do not hide this behind enospc_debug, this is actually important and + * indicates a real bug if this happens. + */ + if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_may_use > 0)) + btrfs_dump_space_info(info, space_info, 0, false); + + /* + * If there was a failure to cleanup a log tree, very likely due to an + * IO failure on a writeback attempt of one or more of its extent + * buffers, we could not do proper (and cheap) unaccounting of their + * reserved space, so don't warn on bytes_reserved > 0 in that case. + */ + if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) || + !BTRFS_FS_LOG_CLEANUP_ERROR(info)) { + if (WARN_ON(space_info->bytes_reserved > 0)) + btrfs_dump_space_info(info, space_info, 0, false); + } + + WARN_ON(space_info->reclaim_size > 0); +} + /* * Must be called only after stopping all workers, since we could have block * group caching kthreads running, and therefore they could race with us if we @@ -4400,8 +4495,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) write_lock(&info->block_group_cache_lock); while (!list_empty(&info->caching_block_groups)) { - caching_ctl = list_entry(info->caching_block_groups.next, - struct btrfs_caching_control, list); + caching_ctl = list_first_entry(&info->caching_block_groups, + struct btrfs_caching_control, list); list_del(&caching_ctl->list); btrfs_put_caching_control(caching_ctl); } @@ -4472,32 +4567,10 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) btrfs_release_global_block_rsv(info); while (!list_empty(&info->space_info)) { - space_info = list_entry(info->space_info.next, - struct btrfs_space_info, - list); - - /* - * Do not hide this behind enospc_debug, this is actually - * important and indicates a real bug if this happens. - */ - if (WARN_ON(space_info->bytes_pinned > 0 || - space_info->bytes_may_use > 0)) - btrfs_dump_space_info(info, space_info, 0, 0); - - /* - * If there was a failure to cleanup a log tree, very likely due - * to an IO failure on a writeback attempt of one or more of its - * extent buffers, we could not do proper (and cheap) unaccounting - * of their reserved space, so don't warn on bytes_reserved > 0 in - * that case. - */ - if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) || - !BTRFS_FS_LOG_CLEANUP_ERROR(info)) { - if (WARN_ON(space_info->bytes_reserved > 0)) - btrfs_dump_space_info(info, space_info, 0, 0); - } + space_info = list_first_entry(&info->space_info, + struct btrfs_space_info, list); - WARN_ON(space_info->reclaim_size > 0); + check_removing_space_info(space_info); list_del(&space_info->list); btrfs_sysfs_remove_space_info(space_info); } diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 36937eeab9b8..a8bb8429c966 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -83,6 +83,8 @@ enum btrfs_block_group_flags { BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, /* Does the block group need to be added to the free space tree? */ BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, + /* Set after we add a new block group to the free space tree. */ + BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, /* Indicate that the block group is placed on a sequential zone */ BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, /* @@ -244,6 +246,11 @@ struct btrfs_block_group { /* Lock for free space tree operations. */ struct mutex free_space_lock; + /* Protected by @free_space_lock. */ + bool using_free_space_bitmaps; + /* Protected by @free_space_lock. */ + bool using_free_space_bitmaps_cached; + /* * Number of extents in this block group used for swap files. * All accesses protected by the spinlock 'lock'. @@ -326,8 +333,8 @@ void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info); void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg); int btrfs_read_block_groups(struct btrfs_fs_info *info); struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, - u64 type, - u64 chunk_offset, u64 size); + struct btrfs_space_info *space_info, + u64 type, u64 chunk_offset, u64 size); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans); int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, bool do_chunk_alloc); @@ -340,9 +347,10 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, u64 ram_bytes, u64 num_bytes, int delalloc, bool force_wrong_size_class); -void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, - u64 num_bytes, int delalloc); -int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, +void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, + bool is_delalloc); +int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_space_info *space_info, u64 flags, enum btrfs_chunk_alloc_enum force); int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type); void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type); diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index a07b9594dc70..5ad6de738aee 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -150,9 +150,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, spin_unlock(&dest->lock); } if (num_bytes) - btrfs_space_info_free_bytes_may_use(fs_info, - space_info, - num_bytes); + btrfs_space_info_free_bytes_may_use(space_info, num_bytes); } if (qgroup_to_release_ret) *qgroup_to_release_ret = qgroup_to_release; @@ -383,13 +381,11 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) if (block_rsv->reserved < block_rsv->size) { num_bytes = block_rsv->size - block_rsv->reserved; - btrfs_space_info_update_bytes_may_use(fs_info, sinfo, - num_bytes); + btrfs_space_info_update_bytes_may_use(sinfo, num_bytes); block_rsv->reserved = block_rsv->size; } else if (block_rsv->reserved > block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; - btrfs_space_info_update_bytes_may_use(fs_info, sinfo, - -num_bytes); + btrfs_space_info_update_bytes_may_use(sinfo, -num_bytes); block_rsv->reserved = block_rsv->size; btrfs_try_granting_tickets(fs_info, sinfo); } @@ -422,6 +418,9 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root) case BTRFS_CHUNK_TREE_OBJECTID: root->block_rsv = &fs_info->chunk_block_rsv; break; + case BTRFS_TREE_LOG_OBJECTID: + root->block_rsv = &fs_info->treelog_rsv; + break; default: root->block_rsv = NULL; break; @@ -442,6 +441,14 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->delayed_block_rsv.space_info = space_info; fs_info->delayed_refs_rsv.space_info = space_info; + /* The treelog_rsv uses a dedicated space_info on the zoned mode. */ + if (!btrfs_is_zoned(fs_info)) { + fs_info->treelog_rsv.space_info = space_info; + } else { + ASSERT(space_info->sub_group[0]->subgroup_id == BTRFS_SUB_GROUP_TREELOG); + fs_info->treelog_rsv.space_info = space_info->sub_group[0]; + } + btrfs_update_global_block_rsv(fs_info); } diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index d12b1fac5c74..79ae9d05cd91 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -24,6 +24,7 @@ enum btrfs_rsv_type { BTRFS_BLOCK_RSV_CHUNK, BTRFS_BLOCK_RSV_DELOPS, BTRFS_BLOCK_RSV_DELREFS, + BTRFS_BLOCK_RSV_TREELOG, BTRFS_BLOCK_RSV_EMPTY, BTRFS_BLOCK_RSV_TEMP, }; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index aa1f55cd81b7..b99fb0273292 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -145,6 +145,7 @@ struct btrfs_inode { * different from prop_compress and takes precedence if set. */ u8 defrag_compress; + s8 defrag_compress_level; /* * Lock for counters and all fields used to determine if the inode is in @@ -516,17 +517,38 @@ static inline void btrfs_assert_inode_locked(struct btrfs_inode *inode) lockdep_assert_held(&inode->vfs_inode.i_rwsem); } +static inline void btrfs_update_inode_mapping_flags(struct btrfs_inode *inode) +{ + if (inode->flags & BTRFS_INODE_NODATASUM) + mapping_clear_stable_writes(inode->vfs_inode.i_mapping); + else + mapping_set_stable_writes(inode->vfs_inode.i_mapping); +} + +static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode) +{ + /* Metadata inode should not reach here. */ + ASSERT(is_data_inode(inode)); + + /* We only allow BITS_PER_LONGS blocks for each bitmap. */ +#ifdef CONFIG_BTRFS_EXPERIMENTAL + mapping_set_folio_order_range(inode->vfs_inode.i_mapping, 0, + ilog2(((BITS_PER_LONG << inode->root->fs_info->sectorsize_bits) + >> PAGE_SHIFT))); +#endif +} + /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes -int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, - u32 pgoff, u8 *csum, const u8 * const csum_expected); +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum, + const u8 * const csum_expected); bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u32 bio_offset, struct bio_vec *bv); -noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, +noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, - bool nowait, bool strict); + bool nowait); void btrfs_del_delalloc_inode(struct btrfs_inode *inode); struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); @@ -538,8 +560,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, const struct fscrypt_str *name, int add_backref, u64 index); int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry); -int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, - int front); +int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end); int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, @@ -584,9 +605,9 @@ void btrfs_free_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); int __init btrfs_init_cachep(void); void __cold btrfs_destroy_cachep(void); -struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, - struct btrfs_path *path); -struct inode *btrfs_iget(u64 ino, struct btrfs_root *root); +struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, + struct btrfs_path *path); +struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct folio *folio, u64 start, u64 len); int btrfs_update_inode(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 0c4d486c3048..d09d622016ef 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -282,15 +282,15 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb) { struct inode *inode = &cb->bbio.inode->vfs_inode; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - unsigned long index = cb->start >> PAGE_SHIFT; - unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; + pgoff_t index = cb->start >> PAGE_SHIFT; + const pgoff_t end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; struct folio_batch fbatch; - const int error = blk_status_to_errno(cb->bbio.bio.bi_status); int i; int ret; - if (error) - mapping_set_error(inode->i_mapping, error); + ret = blk_status_to_errno(cb->bbio.bio.bi_status); + if (ret) + mapping_set_error(inode->i_mapping, ret); folio_batch_init(&fbatch); while (index <= end_index) { @@ -415,7 +415,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, int *memstall, unsigned long *pflags) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - unsigned long end_index; + pgoff_t end_index; struct bio *orig_bio = &cb->orig_bbio->bio; u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size; u64 isize = i_size_read(inode); @@ -446,8 +446,8 @@ static noinline int add_ra_bio_pages(struct inode *inode, end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; while (cur < compressed_end) { - u64 page_end; - u64 pg_index = cur >> PAGE_SHIFT; + pgoff_t page_end; + pgoff_t pg_index = cur >> PAGE_SHIFT; u32 add_size; if (pg_index > end_index) @@ -499,9 +499,9 @@ static noinline int add_ra_bio_pages(struct inode *inode, } page_end = (pg_index << PAGE_SHIFT) + folio_size(folio) - 1; - lock_extent(tree, cur, page_end, NULL); + btrfs_lock_extent(tree, cur, page_end, NULL); read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur); + em = btrfs_lookup_extent_mapping(em_tree, cur, page_end + 1 - cur); read_unlock(&em_tree->lock); /* @@ -510,20 +510,20 @@ static noinline int add_ra_bio_pages(struct inode *inode, * to this compressed extent on disk. */ if (!em || cur < em->start || - (cur + fs_info->sectorsize > extent_map_end(em)) || - (extent_map_block_start(em) >> SECTOR_SHIFT) != + (cur + fs_info->sectorsize > btrfs_extent_map_end(em)) || + (btrfs_extent_map_block_start(em) >> SECTOR_SHIFT) != orig_bio->bi_iter.bi_sector) { - free_extent_map(em); - unlock_extent(tree, cur, page_end, NULL); + btrfs_free_extent_map(em); + btrfs_unlock_extent(tree, cur, page_end, NULL); folio_unlock(folio); folio_put(folio); break; } add_size = min(em->start + em->len, page_end + 1) - cur; - free_extent_map(em); - unlock_extent(tree, cur, page_end, NULL); + btrfs_free_extent_map(em); + btrfs_unlock_extent(tree, cur, page_end, NULL); - if (folio->index == end_index) { + if (folio_contains(folio, end_index)) { size_t zero_offset = offset_in_folio(folio, isize); if (zero_offset) { @@ -576,19 +576,19 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) struct extent_map *em; unsigned long pflags; int memstall = 0; - blk_status_t ret; - int ret2; + blk_status_t status; + int ret; /* we need the actual starting offset of this extent in the file */ read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize); + em = btrfs_lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize); read_unlock(&em_tree->lock); if (!em) { - ret = BLK_STS_IOERR; + status = BLK_STS_IOERR; goto out; } - ASSERT(extent_map_is_compressed(em)); + ASSERT(btrfs_extent_map_is_compressed(em)); compressed_len = em->disk_num_bytes; cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ, @@ -600,21 +600,21 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) cb->len = bbio->bio.bi_iter.bi_size; cb->compressed_len = compressed_len; - cb->compress_type = extent_map_compression(em); + cb->compress_type = btrfs_extent_map_compression(em); cb->orig_bbio = bbio; - free_extent_map(em); + btrfs_free_extent_map(em); cb->nr_folios = DIV_ROUND_UP(compressed_len, PAGE_SIZE); - cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct page *), GFP_NOFS); + cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS); if (!cb->compressed_folios) { - ret = BLK_STS_RESOURCE; + status = BLK_STS_RESOURCE; goto out_free_bio; } - ret2 = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios); - if (ret2) { - ret = BLK_STS_RESOURCE; + ret = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios); + if (ret) { + status = BLK_STS_RESOURCE; goto out_free_compressed_pages; } @@ -637,7 +637,7 @@ out_free_compressed_pages: out_free_bio: bio_put(&cb->bbio.bio); out: - btrfs_bio_end_io(bbio, ret); + btrfs_bio_end_io(bbio, status); } /* @@ -740,7 +740,7 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zstd_compress, }; -static struct list_head *alloc_workspace(int type, unsigned int level) +static struct list_head *alloc_workspace(int type, int level) { switch (type) { case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(); @@ -789,8 +789,8 @@ static void btrfs_init_workspace_manager(int type) */ workspace = alloc_workspace(type, 0); if (IS_ERR(workspace)) { - pr_warn( - "BTRFS: cannot preallocate compression workspace, will try later\n"); + btrfs_warn(NULL, + "cannot preallocate compression workspace, will try later"); } else { atomic_set(&wsm->total_ws, 1); wsm->free_ws = 1; @@ -818,7 +818,7 @@ static void btrfs_cleanup_workspace_manager(int type) * Preallocation makes a forward progress guarantees and we do not return * errors. */ -struct list_head *btrfs_get_workspace(int type, unsigned int level) +struct list_head *btrfs_get_workspace(int type, int level) { struct workspace_manager *wsm; struct list_head *workspace; @@ -888,9 +888,9 @@ again: /* once per minute */ 60 * HZ, /* no burst */ 1); - if (__ratelimit(&_rs)) { - pr_warn("BTRFS: no compression workspaces, low memory, retrying\n"); - } + if (__ratelimit(&_rs)) + btrfs_warn(NULL, + "no compression workspaces, low memory, retrying"); } goto again; } @@ -968,18 +968,28 @@ static void put_workspace(int type, struct list_head *ws) * Adjust @level according to the limits of the compression algorithm or * fallback to default */ -static unsigned int btrfs_compress_set_level(int type, unsigned level) +static int btrfs_compress_set_level(unsigned int type, int level) { const struct btrfs_compress_op *ops = btrfs_compress_op[type]; if (level == 0) level = ops->default_level; else - level = min(level, ops->max_level); + level = clamp(level, ops->min_level, ops->max_level); return level; } +/* + * Check whether the @level is within the valid range for the given type. + */ +bool btrfs_compress_level_valid(unsigned int type, int level) +{ + const struct btrfs_compress_op *ops = btrfs_compress_op[type]; + + return ops->min_level <= level && level <= ops->max_level; +} + /* Wrapper around find_get_page(), with extra error message. */ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, struct folio **in_folio_ret) @@ -1023,12 +1033,10 @@ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, * @total_out is an in/out parameter, must be set to the input length and will * be also used to return the total number of compressed bytes */ -int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping, +int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { - int type = btrfs_compress_type(type_level); - int level = btrfs_compress_level(type_level); const unsigned long orig_len = *total_out; struct list_head *workspace; int ret; @@ -1130,6 +1138,22 @@ void __cold btrfs_exit_compress(void) } /* + * The bvec is a single page bvec from a bio that contains folios from a filemap. + * + * Since the folio may be a large one, and if the bv_page is not a head page of + * a large folio, then page->index is unreliable. + * + * Thus we need this helper to grab the proper file offset. + */ +static u64 file_offset_from_bvec(const struct bio_vec *bvec) +{ + const struct page *page = bvec->bv_page; + const struct folio *folio = page_folio(page); + + return (page_pgoff(folio, page) << PAGE_SHIFT) + bvec->bv_offset; +} + +/* * Copy decompressed data from working buffer to pages. * * @buf: The decompressed data buffer @@ -1174,13 +1198,14 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len, u32 copy_start; /* Offset inside the full decompressed extent */ u32 bvec_offset; + void *kaddr; bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter); /* * cb->start may underflow, but subtracting that value can still * give us correct offset inside the full decompressed extent. */ - bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start; + bvec_offset = file_offset_from_bvec(&bvec) - cb->start; /* Haven't reached the bvec range, exit */ if (decompressed + buf_len <= bvec_offset) @@ -1196,10 +1221,12 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len, * @buf + @buf_len. */ ASSERT(copy_start - decompressed < buf_len); - memcpy_to_page(bvec.bv_page, bvec.bv_offset, - buf + copy_start - decompressed, copy_len); - cur_offset += copy_len; + kaddr = bvec_kmap_local(&bvec); + memcpy(kaddr, buf + copy_start - decompressed, copy_len); + kunmap_local(kaddr); + + cur_offset += copy_len; bio_advance(orig_bio, copy_len); /* Finished the bio */ if (!orig_bio->bi_iter.bi_size) @@ -1455,7 +1482,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, struct heuristic_ws *ws) { struct page *page; - u64 index, index_end; + pgoff_t index, index_end; u32 i, curr_sample_pos; u8 *in_data; @@ -1590,18 +1617,19 @@ out: /* * Convert the compression suffix (eg. after "zlib" starting with ":") to - * level, unrecognized string will set the default level + * level, unrecognized string will set the default level. Negative level + * numbers are allowed. */ -unsigned int btrfs_compress_str2level(unsigned int type, const char *str) +int btrfs_compress_str2level(unsigned int type, const char *str) { - unsigned int level = 0; + int level = 0; int ret; if (!type) return 0; if (str[0] == ':') { - ret = kstrtouint(str + 1, 10, &level); + ret = kstrtoint(str + 1, 10, &level); if (ret) level = 0; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 954034086d0d..1b38e707bbd9 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -11,7 +11,10 @@ #include <linux/list.h> #include <linux/workqueue.h> #include <linux/wait.h> +#include <linux/pagemap.h> #include "bio.h" +#include "fs.h" +#include "messages.h" struct address_space; struct page; @@ -72,28 +75,20 @@ struct compressed_bio { struct btrfs_bio bbio; }; -static inline unsigned int btrfs_compress_type(unsigned int type_level) -{ - return (type_level & 0xF); -} - -static inline unsigned int btrfs_compress_level(unsigned int type_level) -{ - return ((type_level & 0xF0) >> 4); -} - /* @range_end must be exclusive. */ -static inline u32 btrfs_calc_input_length(u64 range_end, u64 cur) +static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u64 cur) { - u64 page_end = round_down(cur, PAGE_SIZE) + PAGE_SIZE; - - return min(range_end, page_end) - cur; + /* @cur must be inside the folio. */ + ASSERT(folio_pos(folio) <= cur); + ASSERT(cur < folio_end(folio)); + return min(range_end, folio_end(folio)) - cur; } int __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void); -int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping, +bool btrfs_compress_level_valid(unsigned int type, int level); +int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, @@ -107,7 +102,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, bool writeback); void btrfs_submit_compressed_read(struct btrfs_bio *bbio); -unsigned int btrfs_compress_str2level(unsigned int type, const char *str); +int btrfs_compress_str2level(unsigned int type, const char *str); struct folio *btrfs_alloc_compr_folio(void); void btrfs_free_compr_folio(struct folio *folio); @@ -118,6 +113,8 @@ enum btrfs_compression_type { BTRFS_COMPRESS_LZO = 2, BTRFS_COMPRESS_ZSTD = 3, BTRFS_NR_COMPRESS_TYPES = 4, + + BTRFS_DEFRAG_DONT_COMPRESS, }; struct workspace_manager { @@ -131,14 +128,15 @@ struct workspace_manager { wait_queue_head_t ws_wait; }; -struct list_head *btrfs_get_workspace(int type, unsigned int level); +struct list_head *btrfs_get_workspace(int type, int level); void btrfs_put_workspace(int type, struct list_head *ws); struct btrfs_compress_op { struct workspace_manager *workspace_manager; /* Maximum level supported by the compression algorithm */ - unsigned int max_level; - unsigned int default_level; + int min_level; + int max_level; + int default_level; }; /* The heuristic workspaces are managed via the 0th workspace manager */ @@ -187,9 +185,9 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in, size_t destlen); void zstd_init_workspace_manager(void); void zstd_cleanup_workspace_manager(void); -struct list_head *zstd_alloc_workspace(unsigned int level); +struct list_head *zstd_alloc_workspace(int level); void zstd_free_workspace(struct list_head *ws); -struct list_head *zstd_get_workspace(unsigned int level); +struct list_head *zstd_get_workspace(int level); void zstd_put_workspace(struct list_head *ws); #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 148648ea1c8b..74e6d7f3d266 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -37,19 +37,6 @@ static int push_node_left(struct btrfs_trans_handle *trans, static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); - -static const struct btrfs_csums { - u16 size; - const char name[10]; - const char driver[12]; -} btrfs_csums[] = { - [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" }, - [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" }, - [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" }, - [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b", - .driver = "blake2b-256" }, -}; - /* * The leaf data grows from end-to-front in the node. this returns the address * of the start of the last item, which is the stop of the leaf data stack. @@ -148,44 +135,6 @@ static inline void copy_leaf_items(const struct extent_buffer *dst, nr_items * sizeof(struct btrfs_item)); } -/* This exists for btrfs-progs usages. */ -u16 btrfs_csum_type_size(u16 type) -{ - return btrfs_csums[type].size; -} - -int btrfs_super_csum_size(const struct btrfs_super_block *s) -{ - u16 t = btrfs_super_csum_type(s); - /* - * csum type is validated at mount time - */ - return btrfs_csum_type_size(t); -} - -const char *btrfs_super_csum_name(u16 csum_type) -{ - /* csum type is validated at mount time */ - return btrfs_csums[csum_type].name; -} - -/* - * Return driver name if defined, otherwise the name that's also a valid driver - * name - */ -const char *btrfs_super_csum_driver(u16 csum_type) -{ - /* csum type is validated at mount time */ - return btrfs_csums[csum_type].driver[0] ? - btrfs_csums[csum_type].driver : - btrfs_csums[csum_type].name; -} - -size_t __attribute_const__ btrfs_get_num_csums(void) -{ - return ARRAY_SIZE(btrfs_csums); -} - struct btrfs_path *btrfs_alloc_path(void) { might_sleep(); @@ -226,22 +175,6 @@ noinline void btrfs_release_path(struct btrfs_path *p) } /* - * We want the transaction abort to print stack trace only for errors where the - * cause could be a bug, eg. due to ENOSPC, and not for common errors that are - * caused by external factors. - */ -bool __cold abort_should_print_stack(int error) -{ - switch (error) { - case -EIO: - case -EROFS: - case -ENOMEM: - return false; - } - return true; -} - -/* * safely gets a reference on the root node of a tree. A lock * is not taken, so a concurrent writer may put a different node * at the root of the tree. See btrfs_lock_root_node for the @@ -265,7 +198,7 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root) * the inc_not_zero dance and if it doesn't work then * synchronize_rcu and try again. */ - if (atomic_inc_not_zero(&eb->refs)) { + if (refcount_inc_not_zero(&eb->refs)) { rcu_read_unlock(); break; } @@ -350,15 +283,26 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); - WARN_ON(btrfs_header_generation(buf) > trans->transid); - if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + if (unlikely(btrfs_header_generation(buf) > trans->transid)) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + return ret; + } + + if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) { ret = btrfs_inc_ref(trans, root, cow, 1); - else + if (ret) + btrfs_abort_transaction(trans, ret); + } else { ret = btrfs_inc_ref(trans, root, cow, 0); + if (ret) + btrfs_abort_transaction(trans, ret); + } if (ret) { btrfs_tree_unlock(cow); free_extent_buffer(cow); - btrfs_abort_transaction(trans, ret); return ret; } @@ -370,9 +314,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, /* * check if the tree block can be shared by multiple trees */ -bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf) +bool btrfs_block_can_be_shared(const struct btrfs_trans_handle *trans, + const struct btrfs_root *root, + const struct extent_buffer *buf) { const u64 buf_gen = btrfs_header_generation(buf); @@ -616,7 +560,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); goto error_unlock_cow; } - atomic_inc(&cow->refs); + refcount_inc(&cow->refs); rcu_assign_pointer(root->node, cow); ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf, @@ -654,6 +598,8 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, goto error_unlock_cow; } } + + trace_btrfs_cow_block(root, buf, cow); if (unlock_orig) btrfs_tree_unlock(buf); free_extent_buffer_stale(buf); @@ -667,9 +613,9 @@ error_unlock_cow: return ret; } -static inline int should_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf) +static inline int should_cow_block(const struct btrfs_trans_handle *trans, + const struct btrfs_root *root, + const struct extent_buffer *buf) { if (btrfs_is_testing(root->fs_info)) return 0; @@ -710,7 +656,6 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; u64 search_start; - int ret; if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) { btrfs_abort_transaction(trans, -EUCLEAN); @@ -751,12 +696,8 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, * Also We don't care about the error, as it's handled internally. */ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); - ret = btrfs_force_cow_block(trans, root, buf, parent, parent_slot, - cow_ret, search_start, 0, nest); - - trace_btrfs_cow_block(root, buf, *cow_ret); - - return ret; + return btrfs_force_cow_block(trans, root, buf, parent, parent_slot, + cow_ret, search_start, 0, nest); } ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO); @@ -794,7 +735,7 @@ int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_ke * Slot may point to the total number of items (i.e. one position beyond the last * key) if the key is bigger than the last key in the extent buffer. */ -int btrfs_bin_search(struct extent_buffer *eb, int first_slot, +int btrfs_bin_search(const struct extent_buffer *eb, int first_slot, const struct btrfs_key *key, int *slot) { unsigned long p; @@ -1151,7 +1092,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* update the path */ if (left) { if (btrfs_header_nritems(left) > orig_slot) { - atomic_inc(&left->refs); + refcount_inc(&left->refs); /* left was locked after cow */ path->nodes[level] = left; path->slots[level + 1] -= 1; @@ -1338,7 +1279,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, * to the block in 'slot', and triggering ra on them. */ static void reada_for_search(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, + const struct btrfs_path *path, int level, int slot, u64 objectid) { struct extent_buffer *node; @@ -1420,7 +1361,7 @@ static void reada_for_search(struct btrfs_fs_info *fs_info, } } -static noinline void reada_for_balance(struct btrfs_path *path, int level) +static noinline void reada_for_balance(const struct btrfs_path *path, int level) { struct extent_buffer *parent; int slot; @@ -1516,8 +1457,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, u64 blocknr; struct extent_buffer *tmp = NULL; int ret = 0; + int ret2; int parent_level; - int err; bool read_tmp = false; bool tmp_locked = false; bool path_released = false; @@ -1566,6 +1507,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (!p->skip_locking) { btrfs_unlock_up_safe(p, parent_level + 1); + btrfs_maybe_reset_lockdep_class(root, tmp); tmp_locked = true; btrfs_tree_read_lock(tmp); btrfs_release_path(p); @@ -1574,9 +1516,9 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, } /* Now we're allowed to do a blocking uptodate check. */ - err = btrfs_read_extent_buffer(tmp, &check); - if (err) { - ret = err; + ret2 = btrfs_read_extent_buffer(tmp, &check); + if (ret2) { + ret = ret2; goto out; } @@ -1609,6 +1551,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (!p->skip_locking) { ASSERT(ret == -EAGAIN); + btrfs_maybe_reset_lockdep_class(root, tmp); tmp_locked = true; btrfs_tree_read_lock(tmp); btrfs_release_path(p); @@ -1616,9 +1559,9 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, } /* Now we're allowed to do a blocking uptodate check. */ - err = btrfs_read_extent_buffer(tmp, &check); - if (err) { - ret = err; + ret2 = btrfs_read_extent_buffer(tmp, &check); + if (ret2) { + ret = ret2; goto out; } @@ -1753,7 +1696,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, if (p->search_commit_root) { b = root->commit_root; - atomic_inc(&b->refs); + refcount_inc(&b->refs); level = btrfs_header_level(b); /* * Ensure that all callers have set skip_locking when @@ -1862,7 +1805,7 @@ static int finish_need_commit_sem_search(struct btrfs_path *path) return 0; } -static inline int search_for_key_slot(struct extent_buffer *eb, +static inline int search_for_key_slot(const struct extent_buffer *eb, int search_low_slot, const struct btrfs_key *key, int prev_cmp, @@ -1996,15 +1939,14 @@ static int search_leaf(struct btrfs_trans_handle *trans, ASSERT(leaf_free_space >= 0); if (leaf_free_space < ins_len) { - int err; - - err = split_leaf(trans, root, key, path, ins_len, - (ret == 0)); - ASSERT(err <= 0); - if (WARN_ON(err > 0)) - err = -EUCLEAN; - if (err) - ret = err; + int ret2; + + ret2 = split_leaf(trans, root, key, path, ins_len, (ret == 0)); + ASSERT(ret2 <= 0); + if (WARN_ON(ret2 > 0)) + ret2 = -EUCLEAN; + if (ret2) + ret = ret2; } } @@ -2046,11 +1988,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow) { - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info; struct extent_buffer *b; int slot; int ret; - int err; int level; int lowest_unlock = 1; /* everything at write_lock_level or lower must be write locked */ @@ -2059,6 +2000,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, int min_write_lock_level; int prev_cmp; + if (!root) + return -EINVAL; + + fs_info = root->fs_info; might_sleep(); lowest_level = p->lowest_level; @@ -2117,6 +2062,7 @@ again: while (b) { int dec = 0; + int ret2; level = btrfs_header_level(b); @@ -2145,16 +2091,15 @@ again: } if (last_level) - err = btrfs_cow_block(trans, root, b, NULL, 0, - &b, - BTRFS_NESTING_COW); + ret2 = btrfs_cow_block(trans, root, b, NULL, 0, + &b, BTRFS_NESTING_COW); else - err = btrfs_cow_block(trans, root, b, - p->nodes[level + 1], - p->slots[level + 1], &b, - BTRFS_NESTING_COW); - if (err) { - ret = err; + ret2 = btrfs_cow_block(trans, root, b, + p->nodes[level + 1], + p->slots[level + 1], &b, + BTRFS_NESTING_COW); + if (ret2) { + ret = ret2; goto done; } } @@ -2202,12 +2147,12 @@ cow_done: slot--; } p->slots[level] = slot; - err = setup_nodes_for_search(trans, root, p, b, level, ins_len, - &write_lock_level); - if (err == -EAGAIN) + ret2 = setup_nodes_for_search(trans, root, p, b, level, ins_len, + &write_lock_level); + if (ret2 == -EAGAIN) goto again; - if (err) { - ret = err; + if (ret2) { + ret = ret2; goto done; } b = p->nodes[level]; @@ -2233,11 +2178,11 @@ cow_done: goto done; } - err = read_block_for_search(root, p, &b, slot, key); - if (err == -EAGAIN && !p->nowait) + ret2 = read_block_for_search(root, p, &b, slot, key); + if (ret2 == -EAGAIN && !p->nowait) goto again; - if (err) { - ret = err; + if (ret2) { + ret = ret2; goto done; } @@ -2300,7 +2245,6 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, struct extent_buffer *b; int slot; int ret; - int err; int level; int lowest_unlock = 1; u8 lowest_level = 0; @@ -2325,6 +2269,7 @@ again: while (b) { int dec = 0; + int ret2; level = btrfs_header_level(b); p->nodes[level] = b; @@ -2360,11 +2305,11 @@ again: goto done; } - err = read_block_for_search(root, p, &b, slot, key); - if (err == -EAGAIN && !p->nowait) + ret2 = read_block_for_search(root, p, &b, slot, key); + if (ret2 == -EAGAIN && !p->nowait) goto again; - if (err) { - ret = err; + if (ret2) { + ret = ret2; goto done; } @@ -2936,6 +2881,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, if (ret < 0) { int ret2; + btrfs_clear_buffer_dirty(trans, c); ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1); if (ret2 < 0) btrfs_abort_transaction(trans, ret2); @@ -2949,7 +2895,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, free_extent_buffer(old); add_root_to_dirty_list(root); - atomic_inc(&c->refs); + refcount_inc(&c->refs); path->nodes[level] = c; path->locks[level] = BTRFS_WRITE_LOCK; path->slots[level] = 0; @@ -3164,7 +3110,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = right->fs_info; struct extent_buffer *left = path->nodes[0]; struct extent_buffer *upper = path->nodes[1]; - struct btrfs_map_token token; struct btrfs_disk_key disk_key; int slot; u32 i; @@ -3238,13 +3183,12 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, copy_leaf_items(right, left, 0, left_nritems - push_items, push_items); /* update the item pointers */ - btrfs_init_map_token(&token, right); right_nritems += push_items; btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(fs_info); for (i = 0; i < right_nritems; i++) { - push_space -= btrfs_token_item_size(&token, i); - btrfs_set_token_item_offset(&token, i, push_space); + push_space -= btrfs_item_size(right, i); + btrfs_set_item_offset(right, i, push_space); } left_nritems -= push_items; @@ -3387,7 +3331,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, int ret = 0; u32 this_item_size; u32 old_left_item_size; - struct btrfs_map_token token; if (empty) nr = min(right_nritems, max_slot); @@ -3435,13 +3378,12 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, old_left_nritems = btrfs_header_nritems(left); BUG_ON(old_left_nritems <= 0); - btrfs_init_map_token(&token, left); old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1); for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { u32 ioff; - ioff = btrfs_token_item_offset(&token, i); - btrfs_set_token_item_offset(&token, i, + ioff = btrfs_item_offset(left, i); + btrfs_set_item_offset(left, i, ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size)); } btrfs_set_header_nritems(left, old_left_nritems + push_items); @@ -3462,13 +3404,12 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, btrfs_header_nritems(right) - push_items); } - btrfs_init_map_token(&token, right); right_nritems -= push_items; btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(fs_info); for (i = 0; i < right_nritems; i++) { - push_space = push_space - btrfs_token_item_size(&token, i); - btrfs_set_token_item_offset(&token, i, push_space); + push_space = push_space - btrfs_item_size(right, i); + btrfs_set_item_offset(right, i, push_space); } btrfs_mark_buffer_dirty(trans, left); @@ -3582,7 +3523,6 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, int i; int ret; struct btrfs_disk_key disk_key; - struct btrfs_map_token token; nritems = nritems - mid; btrfs_set_header_nritems(right, nritems); @@ -3595,12 +3535,11 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid); - btrfs_init_map_token(&token, right); for (i = 0; i < nritems; i++) { u32 ioff; - ioff = btrfs_token_item_offset(&token, i); - btrfs_set_token_item_offset(&token, i, ioff + rt_data_off); + ioff = btrfs_item_offset(right, i); + btrfs_set_item_offset(right, i, ioff + rt_data_off); } btrfs_set_header_nritems(l, mid); @@ -3899,6 +3838,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && + key.type != BTRFS_RAID_STRIPE_KEY && key.type != BTRFS_EXTENT_CSUM_KEY); if (btrfs_leaf_free_space(leaf) >= ins_len) @@ -4065,7 +4005,6 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans, unsigned int old_size; unsigned int size_diff; int i; - struct btrfs_map_token token; leaf = path->nodes[0]; slot = path->slots[0]; @@ -4088,12 +4027,11 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans, * item0..itemN ... dataN.offset..dataN.size .. data0.size */ /* first correct the data pointers */ - btrfs_init_map_token(&token, leaf); for (i = slot; i < nritems; i++) { u32 ioff; - ioff = btrfs_token_item_offset(&token, i); - btrfs_set_token_item_offset(&token, i, ioff + size_diff); + ioff = btrfs_item_offset(leaf, i); + btrfs_set_item_offset(leaf, i, ioff + size_diff); } /* shift the data */ @@ -4156,7 +4094,6 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans, unsigned int old_data; unsigned int old_size; int i; - struct btrfs_map_token token; leaf = path->nodes[0]; @@ -4182,12 +4119,11 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans, * item0..itemN ... dataN.offset..dataN.size .. data0.size */ /* first correct the data pointers */ - btrfs_init_map_token(&token, leaf); for (i = slot; i < nritems; i++) { u32 ioff; - ioff = btrfs_token_item_offset(&token, i); - btrfs_set_token_item_offset(&token, i, ioff - data_size); + ioff = btrfs_item_offset(leaf, i); + btrfs_set_item_offset(leaf, i, ioff - data_size); } /* shift the data */ @@ -4227,7 +4163,6 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; struct extent_buffer *leaf; int slot; - struct btrfs_map_token token; u32 total_size; /* @@ -4255,7 +4190,6 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans, BUG(); } - btrfs_init_map_token(&token, leaf); if (slot != nritems) { unsigned int old_data = btrfs_item_data_end(leaf, slot); @@ -4273,8 +4207,8 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans, for (i = slot; i < nritems; i++) { u32 ioff; - ioff = btrfs_token_item_offset(&token, i); - btrfs_set_token_item_offset(&token, i, + ioff = btrfs_item_offset(leaf, i); + btrfs_set_item_offset(leaf, i, ioff - batch->total_data_size); } /* shift the items */ @@ -4291,8 +4225,8 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans, btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]); btrfs_set_item_key(leaf, &disk_key, slot + i); data_end -= batch->data_sizes[i]; - btrfs_set_token_item_offset(&token, slot + i, data_end); - btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]); + btrfs_set_item_offset(leaf, slot + i, data_end); + btrfs_set_item_size(leaf, slot + i, batch->data_sizes[i]); } btrfs_set_header_nritems(leaf, nritems + batch->nr); @@ -4369,7 +4303,7 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 data_size) { int ret = 0; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; unsigned long ptr; @@ -4383,7 +4317,6 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, write_extent_buffer(leaf, data, ptr, data_size); btrfs_mark_buffer_dirty(trans, leaf); } - btrfs_free_path(path); return ret; } @@ -4506,7 +4439,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used_bytes(root); - atomic_inc(&leaf->refs); + refcount_inc(&leaf->refs); ret = btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1); free_extent_buffer_stale(leaf); if (ret < 0) @@ -4533,7 +4466,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (slot + nr != nritems) { const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1); const int data_end = leaf_data_end(leaf); - struct btrfs_map_token token; u32 dsize = 0; int i; @@ -4543,12 +4475,11 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, memmove_leaf_data(leaf, data_end + dsize, data_end, last_off - data_end); - btrfs_init_map_token(&token, leaf); for (i = slot + nr; i < nritems; i++) { u32 ioff; - ioff = btrfs_token_item_offset(&token, i); - btrfs_set_token_item_offset(&token, i, ioff + dsize); + ioff = btrfs_item_offset(leaf, i); + btrfs_set_item_offset(leaf, i, ioff + dsize); } memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr); @@ -4591,7 +4522,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, * for possible call to btrfs_del_ptr below */ slot = path->slots[1]; - atomic_inc(&leaf->refs); + refcount_inc(&leaf->refs); /* * We want to be able to at least push one item to the * left neighbour leaf, and that's the first item. @@ -4649,16 +4580,13 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, /* * A helper function to walk down the tree starting at min_key, and looking - * for nodes or leaves that are have a minimum transaction id. + * for leaves that have a minimum transaction id. * This is used by the btree defrag code, and tree logging * * This does not cow, but it does stuff the starting key it finds back * into min_key, so you can call btrfs_search_slot with cow=1 on the * key and get a writable path. * - * This honors path->lowest_level to prevent descent past a given level - * of the tree. - * * min_trans indicates the oldest transaction that you are interested * in walking through. Any nodes or leaves older than min_trans are * skipped over (without reading them). @@ -4671,7 +4599,6 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, u64 min_trans) { struct extent_buffer *cur; - struct btrfs_key found_key; int slot; int sret; u32 nritems; @@ -4680,6 +4607,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, int keep_locks = path->keep_locks; ASSERT(!path->nowait); + ASSERT(path->lowest_level == 0); path->keep_locks = 1; again: cur = btrfs_read_lock_root_node(root); @@ -4701,13 +4629,14 @@ again: goto out; } - /* at the lowest level, we're done, setup the path and exit */ - if (level == path->lowest_level) { + /* At level 0 we're done, setup the path and exit. */ + if (level == 0) { if (slot >= nritems) goto find_next_key; ret = 0; path->slots[level] = slot; - btrfs_item_key_to_cpu(cur, &found_key, slot); + /* Save our key for returning back. */ + btrfs_item_key_to_cpu(cur, min_key, slot); goto out; } if (sret && slot > 0) @@ -4731,8 +4660,8 @@ find_next_key: * we didn't find a candidate key in this node, walk forward * and find another one */ + path->slots[level] = slot; if (slot >= nritems) { - path->slots[level] = slot; sret = btrfs_find_next_key(root, path, min_key, level, min_trans); if (sret == 0) { @@ -4742,13 +4671,6 @@ find_next_key: goto out; } } - /* save our key for returning back */ - btrfs_node_key_to_cpu(cur, &found_key, slot); - path->slots[level] = slot; - if (level == path->lowest_level) { - ret = 0; - goto out; - } cur = btrfs_read_node_slot(cur, slot); if (IS_ERR(cur)) { ret = PTR_ERR(cur); @@ -4763,10 +4685,8 @@ find_next_key: } out: path->keep_locks = keep_locks; - if (ret == 0) { - btrfs_unlock_up_safe(path, path->lowest_level + 1); - memcpy(min_key, &found_key, sizeof(found_key)); - } + if (ret == 0) + btrfs_unlock_up_safe(path, 1); return ret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 307dedf95c70..fe70b593c7cd 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -6,8 +6,7 @@ #ifndef BTRFS_CTREE_H #define BTRFS_CTREE_H -#include "linux/cleanup.h" -#include <linux/pagemap.h> +#include <linux/cleanup.h> #include <linux/spinlock.h> #include <linux/rbtree.h> #include <linux/mutex.h> @@ -62,7 +61,6 @@ struct btrfs_path { /* if there is real range locking, this locks field will change */ u8 locks[BTRFS_MAX_LEVEL]; u8 reada; - /* keep some upper locks as we walk down */ u8 lowest_level; /* @@ -70,6 +68,7 @@ struct btrfs_path { * and to force calls to keep space in the nodes */ unsigned int search_for_split:1; + /* Keep some upper locks as we walk down. */ unsigned int keep_locks:1; unsigned int skip_locking:1; unsigned int search_commit_root:1; @@ -225,16 +224,10 @@ struct btrfs_root { struct list_head root_list; - /* - * Xarray that keeps track of in-memory inodes, protected by the lock - * @inode_lock. - */ + /* Xarray that keeps track of in-memory inodes. */ struct xarray inodes; - /* - * Xarray that keeps track of delayed nodes of every inode, protected - * by @inode_lock. - */ + /* Xarray that keeps track of delayed nodes of every inode. */ struct xarray delayed_nodes; /* * right now this just gets used so that a root has its own devid @@ -371,6 +364,25 @@ static inline void btrfs_set_root_last_trans(struct btrfs_root *root, u64 transi } /* + * Return the generation this root started with. + * + * Every normal root that is created with root->root_key.offset set to it's + * originating generation. If it is a snapshot it is the generation when the + * snapshot was created. + * + * However for TREE_RELOC roots root_key.offset is the objectid of the owning + * tree root. Thankfully we copy the root item of the owning tree root, which + * has it's last_snapshot set to what we would have root_key.offset set to, so + * return that if this is a TREE_RELOC root. + */ +static inline u64 btrfs_root_origin_generation(const struct btrfs_root *root) +{ + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) + return btrfs_root_last_snapshot(&root->root_item); + return root->root_key.offset; +} + +/* * Structure that conveys information about an extent that is going to replace * all the extents in a file range. */ @@ -487,24 +499,10 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); } -#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ - ((bytes) >> (fs_info)->sectorsize_bits) - -static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) -{ - return mapping_gfp_constraint(mapping, ~__GFP_FS); -} - -void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); -int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 *actual_bytes); -int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); - -/* ctree.c */ int __init btrfs_ctree_init(void); void __cold btrfs_ctree_exit(void); -int btrfs_bin_search(struct extent_buffer *eb, int first_slot, +int btrfs_bin_search(const struct extent_buffer *eb, int first_slot, const struct btrfs_key *key, int *slot); int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); @@ -572,9 +570,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer **cow_ret, u64 new_root_objectid); -bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf); +bool btrfs_block_can_be_shared(const struct btrfs_trans_handle *trans, + const struct btrfs_root *root, + const struct extent_buffer *buf); int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); void btrfs_extend_item(struct btrfs_trans_handle *trans, @@ -723,13 +721,18 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) } int btrfs_leaf_free_space(const struct extent_buffer *leaf); -static inline int is_fstree(u64 rootid) +static inline bool btrfs_is_fstree(u64 rootid) { - if (rootid == BTRFS_FS_TREE_OBJECTID || - ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID && - !btrfs_qgroup_level(rootid))) - return 1; - return 0; + if (rootid == BTRFS_FS_TREE_OBJECTID) + return true; + + if ((s64)rootid < (s64)BTRFS_FIRST_FREE_OBJECTID) + return false; + + if (btrfs_qgroup_level(rootid) != 0) + return false; + + return true; } static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) @@ -737,18 +740,4 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; } -u16 btrfs_csum_type_size(u16 type); -int btrfs_super_csum_size(const struct btrfs_super_block *s); -const char *btrfs_super_csum_name(u16 csum_type); -const char *btrfs_super_csum_driver(u16 csum_type); -size_t __attribute_const__ btrfs_get_num_csums(void); - -/* - * We use folio flag owner_2 to indicate there is an ordered extent with - * unfinished IO. - */ -#define folio_test_ordered(folio) folio_test_owner_2(folio) -#define folio_set_ordered(folio) folio_set_owner_2(folio) -#define folio_clear_ordered(folio) folio_clear_owner_2(folio) - #endif diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 968dae953948..738179a5e170 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -60,6 +60,14 @@ static int compare_inode_defrag(const struct inode_defrag *defrag1, return 0; } +static int inode_defrag_cmp(struct rb_node *new, const struct rb_node *existing) +{ + const struct inode_defrag *new_defrag = rb_entry(new, struct inode_defrag, rb_node); + const struct inode_defrag *existing_defrag = rb_entry(existing, struct inode_defrag, rb_node); + + return compare_inode_defrag(new_defrag, existing_defrag); +} + /* * Insert a record for an inode into the defrag tree. The lock must be held * already. @@ -71,49 +79,35 @@ static int btrfs_insert_inode_defrag(struct btrfs_inode *inode, struct inode_defrag *defrag) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct inode_defrag *entry; - struct rb_node **p; - struct rb_node *parent = NULL; - int ret; + struct rb_node *node; - p = &fs_info->defrag_inodes.rb_node; - while (*p) { - parent = *p; - entry = rb_entry(parent, struct inode_defrag, rb_node); + node = rb_find_add(&defrag->rb_node, &fs_info->defrag_inodes, inode_defrag_cmp); + if (node) { + struct inode_defrag *entry; - ret = compare_inode_defrag(defrag, entry); - if (ret < 0) - p = &parent->rb_left; - else if (ret > 0) - p = &parent->rb_right; - else { - /* - * If we're reinserting an entry for an old defrag run, - * make sure to lower the transid of our existing - * record. - */ - if (defrag->transid < entry->transid) - entry->transid = defrag->transid; - entry->extent_thresh = min(defrag->extent_thresh, - entry->extent_thresh); - return -EEXIST; - } + entry = rb_entry(node, struct inode_defrag, rb_node); + /* + * If we're reinserting an entry for an old defrag run, make + * sure to lower the transid of our existing record. + */ + if (defrag->transid < entry->transid) + entry->transid = defrag->transid; + entry->extent_thresh = min(defrag->extent_thresh, entry->extent_thresh); + return -EEXIST; } set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags); - rb_link_node(&defrag->rb_node, parent, p); - rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes); return 0; } -static inline int need_auto_defrag(struct btrfs_fs_info *fs_info) +static inline bool need_auto_defrag(struct btrfs_fs_info *fs_info) { if (!btrfs_test_opt(fs_info, AUTO_DEFRAG)) - return 0; + return false; if (btrfs_fs_closing(fs_info)) - return 0; + return false; - return 1; + return true; } /* @@ -191,10 +185,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode( if (parent && compare_inode_defrag(&tmp, entry) > 0) { parent = rb_next(parent); - if (parent) - entry = rb_entry(parent, struct inode_defrag, rb_node); - else - entry = NULL; + entry = rb_entry_safe(parent, struct inode_defrag, rb_node); } out: if (entry) @@ -225,7 +216,7 @@ static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, struct file_ra_state *ra) { struct btrfs_root *inode_root; - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_ioctl_defrag_range_args range; int ret = 0; u64 cur = 0; @@ -250,24 +241,24 @@ again: goto cleanup; } - if (cur >= i_size_read(inode)) { - iput(inode); + if (cur >= i_size_read(&inode->vfs_inode)) { + iput(&inode->vfs_inode); goto cleanup; } /* Do a chunk of defrag */ - clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); + clear_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags); memset(&range, 0, sizeof(range)); range.len = (u64)-1; range.start = cur; range.extent_thresh = defrag->extent_thresh; - file_ra_state_init(ra, inode->i_mapping); + file_ra_state_init(ra, inode->vfs_inode.i_mapping); sb_start_write(fs_info->sb); ret = btrfs_defrag_file(inode, ra, &range, defrag->transid, - BTRFS_DEFRAG_BATCH); + BTRFS_DEFRAG_BATCH); sb_end_write(fs_info->sb); - iput(inode); + iput(&inode->vfs_inode); if (ret < 0) goto cleanup; @@ -624,7 +615,7 @@ static struct extent_map *defrag_get_extent(struct btrfs_inode *inode, u64 ino = btrfs_ino(inode); int ret; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { ret = -ENOMEM; goto err; @@ -734,12 +725,12 @@ next: not_found: btrfs_release_path(&path); - free_extent_map(em); + btrfs_free_extent_map(em); return NULL; err: btrfs_release_path(&path); - free_extent_map(em); + btrfs_free_extent_map(em); return ERR_PTR(ret); } @@ -756,7 +747,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, * full extent lock. */ read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, sectorsize); + em = btrfs_lookup_extent_mapping(em_tree, start, sectorsize); read_unlock(&em_tree->lock); /* @@ -769,7 +760,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, * file extent items in the inode's subvolume tree). */ if (em && (em->flags & EXTENT_FLAG_MERGED)) { - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; } @@ -779,10 +770,10 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, /* Get the big lock and read metadata off disk. */ if (!locked) - lock_extent(io_tree, start, end, &cached); + btrfs_lock_extent(io_tree, start, end, &cached); em = defrag_get_extent(BTRFS_I(inode), start, newer_than); if (!locked) - unlock_extent(io_tree, start, end, &cached); + btrfs_unlock_extent(io_tree, start, end, &cached); if (IS_ERR(em)) return NULL; @@ -794,7 +785,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info, const struct extent_map *em) { - if (extent_map_is_compressed(em)) + if (btrfs_extent_map_is_compressed(em)) return BTRFS_MAX_COMPRESSED; return fs_info->max_extent_size; } @@ -837,7 +828,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, ret = true; out: - free_extent_map(next); + btrfs_free_extent_map(next); return ret; } @@ -857,13 +848,14 @@ static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t { struct address_space *mapping = inode->vfs_inode.i_mapping; gfp_t mask = btrfs_alloc_write_mask(mapping); - u64 page_start = (u64)index << PAGE_SHIFT; - u64 page_end = page_start + PAGE_SIZE - 1; + u64 lock_start; + u64 lock_end; struct extent_state *cached_state = NULL; struct folio *folio; int ret; again: + /* TODO: Add order fgp order flags when large folios are fully enabled. */ folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); if (IS_ERR(folio)) @@ -871,13 +863,16 @@ again: /* * Since we can defragment files opened read-only, we can encounter - * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We - * can't do I/O using huge pages yet, so return an error for now. + * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). + * + * The IO for such large folios is not fully tested, thus return + * an error to reject such folios unless it's an experimental build. + * * Filesystem transparent huge pages are typically only used for * executables that explicitly enable them, so this isn't very * restrictive. */ - if (folio_test_large(folio)) { + if (!IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && folio_test_large(folio)) { folio_unlock(folio); folio_put(folio); return ERR_PTR(-ETXTBSY); @@ -890,14 +885,15 @@ again: return ERR_PTR(ret); } + lock_start = folio_pos(folio); + lock_end = folio_end(folio) - 1; /* Wait for any existing ordered extent in the range */ while (1) { struct btrfs_ordered_extent *ordered; - lock_extent(&inode->io_tree, page_start, page_end, &cached_state); - ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); - unlock_extent(&inode->io_tree, page_start, page_end, - &cached_state); + btrfs_lock_extent(&inode->io_tree, lock_start, lock_end, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, lock_start, folio_size(folio)); + btrfs_unlock_extent(&inode->io_tree, lock_start, lock_end, &cached_state); if (!ordered) break; @@ -951,7 +947,7 @@ struct defrag_target_range { * @extent_thresh: file extent size threshold, any extent size >= this value * will be ignored * @newer_than: only defrag extents newer than this value - * @do_compress: whether the defrag is doing compression + * @do_compress: whether the defrag is doing compression or no-compression * if true, @extent_thresh will be ignored and all regular * file extents meeting @newer_than will be targets. * @locked: if the range has already held extent lock @@ -1027,8 +1023,8 @@ static int defrag_collect_targets(struct btrfs_inode *inode, * very likely resulting in a larger extent after writeback is * triggered (except in a case of free space fragmentation). */ - if (test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1, - EXTENT_DELALLOC)) + if (btrfs_test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1, + EXTENT_DELALLOC)) goto next; /* @@ -1066,8 +1062,8 @@ static int defrag_collect_targets(struct btrfs_inode *inode, /* Empty target list, no way to merge with last entry */ if (list_empty(target_list)) goto next; - last = list_entry(target_list->prev, - struct defrag_target_range, list); + last = list_last_entry(target_list, + struct defrag_target_range, list); /* Not mergeable with last entry */ if (last->start + last->len != cur) goto next; @@ -1077,7 +1073,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, add: last_is_target = true; - range_len = min(extent_map_end(em), start + len) - cur; + range_len = min(btrfs_extent_map_end(em), start + len) - cur; /* * This one is a good target, check if it can be merged into * last range of the target list. @@ -1085,8 +1081,8 @@ add: if (!list_empty(target_list)) { struct defrag_target_range *last; - last = list_entry(target_list->prev, - struct defrag_target_range, list); + last = list_last_entry(target_list, + struct defrag_target_range, list); ASSERT(last->start + last->len <= cur); if (last->start + last->len == cur) { /* Mergeable, enlarge the last entry */ @@ -1099,7 +1095,7 @@ add: /* Allocate new defrag_target_range */ new = kmalloc(sizeof(*new), GFP_NOFS); if (!new) { - free_extent_map(em); + btrfs_free_extent_map(em); ret = -ENOMEM; break; } @@ -1108,8 +1104,8 @@ add: list_add_tail(&new->list, target_list); next: - cur = extent_map_end(em); - free_extent_map(em); + cur = btrfs_extent_map_end(em); + btrfs_free_extent_map(em); } if (ret < 0) { struct defrag_target_range *entry; @@ -1162,27 +1158,30 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, struct extent_changeset *data_reserved = NULL; const u64 start = target->start; const u64 len = target->len; - unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; - unsigned long start_index = start >> PAGE_SHIFT; - unsigned long first_index = folios[0]->index; int ret = 0; - int i; - - ASSERT(last_index - first_index + 1 <= nr_pages); ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len); if (ret < 0) return ret; - clear_extent_bit(&inode->io_tree, start, start + len - 1, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, cached_state); - set_extent_bit(&inode->io_tree, start, start + len - 1, - EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state); - - /* Update the page status */ - for (i = start_index - first_index; i <= last_index - first_index; i++) { - folio_clear_checked(folios[i]); - btrfs_folio_clamp_set_dirty(fs_info, folios[i], start, len); + btrfs_clear_extent_bit(&inode->io_tree, start, start + len - 1, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, cached_state); + btrfs_set_extent_bit(&inode->io_tree, start, start + len - 1, + EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state); + + /* + * Update the page status. + * Due to possible large folios, we have to check all folios one by one. + */ + for (int i = 0; i < nr_pages && folios[i]; i++) { + struct folio *folio = folios[i]; + + if (!folio) + break; + if (start >= folio_end(folio) || start + len <= folio_pos(folio)) + continue; + btrfs_folio_clamp_clear_checked(fs_info, folio, start, len); + btrfs_folio_clamp_set_dirty(fs_info, folio, start, len); } btrfs_delalloc_release_extents(inode, len); extent_changeset_free(data_reserved); @@ -1200,11 +1199,10 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, LIST_HEAD(target_list); struct folio **folios; const u32 sectorsize = inode->root->fs_info->sectorsize; - u64 last_index = (start + len - 1) >> PAGE_SHIFT; - u64 start_index = start >> PAGE_SHIFT; - unsigned int nr_pages = last_index - start_index + 1; + u64 cur = start; + const unsigned int nr_pages = ((start + len - 1) >> PAGE_SHIFT) - + (start >> PAGE_SHIFT) + 1; int ret = 0; - int i; ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); @@ -1214,21 +1212,25 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, return -ENOMEM; /* Prepare all pages */ - for (i = 0; i < nr_pages; i++) { - folios[i] = defrag_prepare_one_folio(inode, start_index + i); + for (int i = 0; cur < start + len && i < nr_pages; i++) { + folios[i] = defrag_prepare_one_folio(inode, cur >> PAGE_SHIFT); if (IS_ERR(folios[i])) { ret = PTR_ERR(folios[i]); - nr_pages = i; + folios[i] = NULL; goto free_folios; } + cur = folio_end(folios[i]); } - for (i = 0; i < nr_pages; i++) + for (int i = 0; i < nr_pages; i++) { + if (!folios[i]) + break; folio_wait_writeback(folios[i]); + } + /* We should get at least one folio. */ + ASSERT(folios[0]); /* Lock the pages range */ - lock_extent(&inode->io_tree, start_index << PAGE_SHIFT, - (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, - &cached_state); + btrfs_lock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state); /* * Now we have a consistent view about the extent map, re-check * which range really needs to be defragged. @@ -1254,11 +1256,11 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, kfree(entry); } unlock_extent: - unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT, - (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, - &cached_state); + btrfs_unlock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state); free_folios: - for (i = 0; i < nr_pages; i++) { + for (int i = 0; i < nr_pages; i++) { + if (!folios[i]) + break; folio_unlock(folios[i]); folio_put(folios[i]); } @@ -1352,17 +1354,19 @@ out: * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without * defragging all the range). */ -int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, +int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long sectors_defragged = 0; - u64 isize = i_size_read(inode); + u64 isize = i_size_read(&inode->vfs_inode); u64 cur; u64 last_byte; bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS); + bool no_compress = (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS); int compress_type = BTRFS_COMPRESS_ZLIB; + int compress_level = 0; int ret = 0; u32 extent_thresh = range->extent_thresh; pgoff_t start_index; @@ -1376,10 +1380,24 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, return -EINVAL; if (do_compress) { - if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES) - return -EINVAL; - if (range->compress_type) - compress_type = range->compress_type; + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL) { + if (range->compress.type >= BTRFS_NR_COMPRESS_TYPES) + return -EINVAL; + if (range->compress.type) { + compress_type = range->compress.type; + compress_level = range->compress.level; + if (!btrfs_compress_level_valid(compress_type, compress_level)) + return -EINVAL; + } + } else { + if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES) + return -EINVAL; + if (range->compress_type) + compress_type = range->compress_type; + } + } else if (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS) { + compress_type = BTRFS_DEFRAG_DONT_COMPRESS; + compress_level = 1; } if (extent_thresh == 0) @@ -1402,8 +1420,8 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, * defrag range can be written sequentially. */ start_index = cur >> PAGE_SHIFT; - if (start_index < inode->i_mapping->writeback_index) - inode->i_mapping->writeback_index = start_index; + if (start_index < inode->vfs_inode.i_mapping->writeback_index) + inode->vfs_inode.i_mapping->writeback_index = start_index; while (cur < last_byte) { const unsigned long prev_sectors_defragged = sectors_defragged; @@ -1420,27 +1438,30 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1; cluster_end = min(cluster_end, last_byte); - btrfs_inode_lock(BTRFS_I(inode), 0); - if (IS_SWAPFILE(inode)) { + btrfs_inode_lock(inode, 0); + if (IS_SWAPFILE(&inode->vfs_inode)) { ret = -ETXTBSY; - btrfs_inode_unlock(BTRFS_I(inode), 0); + btrfs_inode_unlock(inode, 0); break; } - if (!(inode->i_sb->s_flags & SB_ACTIVE)) { - btrfs_inode_unlock(BTRFS_I(inode), 0); + if (!(inode->vfs_inode.i_sb->s_flags & SB_ACTIVE)) { + btrfs_inode_unlock(inode, 0); break; } - if (do_compress) - BTRFS_I(inode)->defrag_compress = compress_type; - ret = defrag_one_cluster(BTRFS_I(inode), ra, cur, + if (do_compress || no_compress) { + inode->defrag_compress = compress_type; + inode->defrag_compress_level = compress_level; + } + ret = defrag_one_cluster(inode, ra, cur, cluster_end + 1 - cur, extent_thresh, - newer_than, do_compress, §ors_defragged, + newer_than, do_compress || no_compress, + §ors_defragged, max_to_defrag, &last_scanned); if (sectors_defragged > prev_sectors_defragged) - balance_dirty_pages_ratelimited(inode->i_mapping); + balance_dirty_pages_ratelimited(inode->vfs_inode.i_mapping); - btrfs_inode_unlock(BTRFS_I(inode), 0); + btrfs_inode_unlock(inode, 0); if (ret < 0) break; cur = max(cluster_end + 1, last_scanned); @@ -1462,10 +1483,10 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, * need to be written back immediately. */ if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) { - filemap_flush(inode->i_mapping); + filemap_flush(inode->vfs_inode.i_mapping); if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_flush(inode->i_mapping); + &inode->runtime_flags)) + filemap_flush(inode->vfs_inode.i_mapping); } if (range->compress_type == BTRFS_COMPRESS_LZO) btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); @@ -1473,10 +1494,10 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); ret = sectors_defragged; } - if (do_compress) { - btrfs_inode_lock(BTRFS_I(inode), 0); - BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; - btrfs_inode_unlock(BTRFS_I(inode), 0); + if (do_compress || no_compress) { + btrfs_inode_lock(inode, 0); + inode->defrag_compress = BTRFS_COMPRESS_NONE; + btrfs_inode_unlock(inode, 0); } return ret; } diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h index 6b7596c4f0dc..a7f917a38dbf 100644 --- a/fs/btrfs/defrag.h +++ b/fs/btrfs/defrag.h @@ -6,14 +6,14 @@ #include <linux/types.h> #include <linux/compiler_types.h> -struct inode; struct file_ra_state; +struct btrfs_inode; struct btrfs_fs_info; struct btrfs_root; struct btrfs_trans_handle; struct btrfs_ioctl_defrag_range_args; -int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, +int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag); int __init btrfs_auto_defrag_init(void); diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 7aa8a395d838..288e1776c02d 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -111,6 +111,18 @@ * making error handling and cleanup easier. */ +static inline struct btrfs_space_info *data_sinfo_for_inode(const struct btrfs_inode *inode) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + if (btrfs_is_zoned(fs_info) && btrfs_is_data_reloc_root(inode->root)) { + ASSERT(fs_info->data_sinfo->sub_group[0]->subgroup_id == + BTRFS_SUB_GROUP_DATA_RELOC); + return fs_info->data_sinfo->sub_group[0]; + } + return fs_info->data_sinfo; +} + int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes) { struct btrfs_root *root = inode->root; @@ -123,7 +135,7 @@ int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes) if (btrfs_is_free_space_inode(inode)) flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE; - return btrfs_reserve_data_bytes(fs_info, bytes, flush); + return btrfs_reserve_data_bytes(data_sinfo_for_inode(inode), bytes, flush); } int btrfs_check_data_free_space(struct btrfs_inode *inode, @@ -144,14 +156,14 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode, else if (btrfs_is_free_space_inode(inode)) flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE; - ret = btrfs_reserve_data_bytes(fs_info, len, flush); + ret = btrfs_reserve_data_bytes(data_sinfo_for_inode(inode), len, flush); if (ret < 0) return ret; /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); if (ret < 0) { - btrfs_free_reserved_data_space_noquota(fs_info, len); + btrfs_free_reserved_data_space_noquota(inode, len); extent_changeset_free(*reserved); *reserved = NULL; } else { @@ -168,15 +180,13 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode, * which we can't sleep and is sure it won't affect qgroup reserved space. * Like clear_bit_hook(). */ -void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, - u64 len) +void btrfs_free_reserved_data_space_noquota(struct btrfs_inode *inode, u64 len) { - struct btrfs_space_info *data_sinfo; + struct btrfs_fs_info *fs_info = inode->root->fs_info; ASSERT(IS_ALIGNED(len, fs_info->sectorsize)); - data_sinfo = fs_info->data_sinfo; - btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len); + btrfs_space_info_free_bytes_may_use(data_sinfo_for_inode(inode), len); } /* @@ -196,7 +206,7 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode, round_down(start, fs_info->sectorsize); start = round_down(start, fs_info->sectorsize); - btrfs_free_reserved_data_space_noquota(fs_info, len); + btrfs_free_reserved_data_space_noquota(inode, len); btrfs_qgroup_free_data(inode, reserved, start, len, NULL); } @@ -439,6 +449,29 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) btrfs_inode_rsv_release(inode, true); } +/* Shrink a previously reserved extent to a new length. */ +void btrfs_delalloc_shrink_extents(struct btrfs_inode *inode, u64 reserved_len, u64 new_len) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u32 reserved_num_extents = count_max_extents(fs_info, reserved_len); + const u32 new_num_extents = count_max_extents(fs_info, new_len); + const int diff_num_extents = new_num_extents - reserved_num_extents; + + ASSERT(new_len <= reserved_len); + if (new_num_extents == reserved_num_extents) + return; + + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, diff_num_extents); + btrfs_calculate_inode_block_rsv_size(fs_info, inode); + spin_unlock(&inode->lock); + + if (btrfs_is_testing(fs_info)) + return; + + btrfs_inode_rsv_release(inode, true); +} + /* * Reserve data and metadata space for delalloc * diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h index 3f32953c0a80..6119c0d3f883 100644 --- a/fs/btrfs/delalloc-space.h +++ b/fs/btrfs/delalloc-space.h @@ -18,8 +18,7 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode, void btrfs_delalloc_release_space(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len, bool qgroup_free); -void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, - u64 len); +void btrfs_free_reserved_data_space_noquota(struct btrfs_inode *inode, u64 len); void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, bool qgroup_free); int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, @@ -27,5 +26,6 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, u64 disk_num_bytes, bool noflush); void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); +void btrfs_delalloc_shrink_extents(struct btrfs_inode *inode, u64 reserved_len, u64 new_len); #endif /* BTRFS_DELALLOC_SPACE_H */ diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 508bdbae29a0..0f8d8e275143 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -119,7 +119,12 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( return NULL; } -/* Will return either the node or PTR_ERR(-ENOMEM) */ +/* + * Look up an existing delayed node associated with @btrfs_inode or create a new + * one and insert it to the delayed nodes of the root. + * + * Return the delayed node, or error pointer on failure. + */ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( struct btrfs_inode *btrfs_inode) { @@ -211,17 +216,13 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, static struct btrfs_delayed_node *btrfs_first_delayed_node( struct btrfs_delayed_root *delayed_root) { - struct list_head *p; - struct btrfs_delayed_node *node = NULL; + struct btrfs_delayed_node *node; spin_lock(&delayed_root->lock); - if (list_empty(&delayed_root->node_list)) - goto out; - - p = delayed_root->node_list.next; - node = list_entry(p, struct btrfs_delayed_node, n_list); - refcount_inc(&node->refs); -out: + node = list_first_entry_or_null(&delayed_root->node_list, + struct btrfs_delayed_node, n_list); + if (node) + refcount_inc(&node->refs); spin_unlock(&delayed_root->lock); return node; @@ -293,18 +294,15 @@ static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node) static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( struct btrfs_delayed_root *delayed_root) { - struct list_head *p; - struct btrfs_delayed_node *node = NULL; + struct btrfs_delayed_node *node; spin_lock(&delayed_root->lock); - if (list_empty(&delayed_root->prepare_list)) - goto out; - - p = delayed_root->prepare_list.next; - list_del_init(p); - node = list_entry(p, struct btrfs_delayed_node, p_list); - refcount_inc(&node->refs); -out: + node = list_first_entry_or_null(&delayed_root->prepare_list, + struct btrfs_delayed_node, p_list); + if (node) { + list_del_init(&node->p_list); + refcount_inc(&node->refs); + } spin_unlock(&delayed_root->lock); return node; @@ -336,6 +334,20 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len, return item; } +static int delayed_item_index_cmp(const void *key, const struct rb_node *node) +{ + const u64 *index = key; + const struct btrfs_delayed_item *delayed_item = rb_entry(node, + struct btrfs_delayed_item, rb_node); + + if (delayed_item->index < *index) + return 1; + else if (delayed_item->index > *index) + return -1; + + return 0; +} + /* * Look up the delayed item by key. * @@ -349,57 +361,35 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_item( struct rb_root *root, u64 index) { - struct rb_node *node = root->rb_node; - struct btrfs_delayed_item *delayed_item = NULL; + struct rb_node *node; - while (node) { - delayed_item = rb_entry(node, struct btrfs_delayed_item, - rb_node); - if (delayed_item->index < index) - node = node->rb_right; - else if (delayed_item->index > index) - node = node->rb_left; - else - return delayed_item; - } + node = rb_find(&index, root, delayed_item_index_cmp); + return rb_entry_safe(node, struct btrfs_delayed_item, rb_node); +} - return NULL; +static int btrfs_delayed_item_cmp(const struct rb_node *new, + const struct rb_node *exist) +{ + const struct btrfs_delayed_item *new_item = + rb_entry(new, struct btrfs_delayed_item, rb_node); + + return delayed_item_index_cmp(&new_item->index, exist); } static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, struct btrfs_delayed_item *ins) { - struct rb_node **p, *node; - struct rb_node *parent_node = NULL; struct rb_root_cached *root; - struct btrfs_delayed_item *item; - bool leftmost = true; + struct rb_node *exist; if (ins->type == BTRFS_DELAYED_INSERTION_ITEM) root = &delayed_node->ins_root; else root = &delayed_node->del_root; - p = &root->rb_root.rb_node; - node = &ins->rb_node; - - while (*p) { - parent_node = *p; - item = rb_entry(parent_node, struct btrfs_delayed_item, - rb_node); - - if (item->index < ins->index) { - p = &(*p)->rb_right; - leftmost = false; - } else if (item->index > ins->index) { - p = &(*p)->rb_left; - } else { - return -EEXIST; - } - } - - rb_link_node(node, parent_node, p); - rb_insert_color_cached(node, root, leftmost); + exist = rb_find_add_cached(&ins->rb_node, root, btrfs_delayed_item_cmp); + if (exist) + return -EEXIST; if (ins->type == BTRFS_DELAYED_INSERTION_ITEM && ins->index >= delayed_node->index_cnt) @@ -459,40 +449,25 @@ static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item( struct btrfs_delayed_node *delayed_node) { - struct rb_node *p; - struct btrfs_delayed_item *item = NULL; - - p = rb_first_cached(&delayed_node->ins_root); - if (p) - item = rb_entry(p, struct btrfs_delayed_item, rb_node); + struct rb_node *p = rb_first_cached(&delayed_node->ins_root); - return item; + return rb_entry_safe(p, struct btrfs_delayed_item, rb_node); } static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item( struct btrfs_delayed_node *delayed_node) { - struct rb_node *p; - struct btrfs_delayed_item *item = NULL; - - p = rb_first_cached(&delayed_node->del_root); - if (p) - item = rb_entry(p, struct btrfs_delayed_item, rb_node); + struct rb_node *p = rb_first_cached(&delayed_node->del_root); - return item; + return rb_entry_safe(p, struct btrfs_delayed_item, rb_node); } static struct btrfs_delayed_item *__btrfs_next_delayed_item( struct btrfs_delayed_item *item) { - struct rb_node *p; - struct btrfs_delayed_item *next = NULL; + struct rb_node *p = rb_next(&item->rb_node); - p = rb_next(&item->rb_node); - if (p) - next = rb_entry(p, struct btrfs_delayed_item, rb_node); - - return next; + return rb_entry_safe(p, struct btrfs_delayed_item, rb_node); } static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, @@ -1030,15 +1005,22 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, ret = btrfs_lookup_inode(trans, root, path, &key, mod); if (ret > 0) ret = -ENOENT; - if (ret < 0) + if (ret < 0) { + /* + * If we fail to update the delayed inode we need to abort the + * transaction, because we could leave the inode with the + * improper counts behind. + */ + if (ret != -ENOENT) + btrfs_abort_transaction(trans, ret); goto out; + } leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item, sizeof(struct btrfs_inode_item)); - btrfs_mark_buffer_dirty(trans, leaf); if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) goto out; @@ -1057,8 +1039,10 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, btrfs_release_path(path); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) + if (ret < 0) { + btrfs_abort_transaction(trans, ret); goto err_out; + } ASSERT(ret > 0); ASSERT(path->slots[0] > 0); ret = 0; @@ -1080,21 +1064,14 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, * in the same item doesn't exist. */ ret = btrfs_del_item(trans, root, path); + if (ret < 0) + btrfs_abort_transaction(trans, ret); out: btrfs_release_delayed_iref(node); btrfs_release_path(path); err_out: btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0)); btrfs_release_delayed_inode(node); - - /* - * If we fail to update the delayed inode we need to abort the - * transaction, because we could leave the inode with the improper - * counts behind. - */ - if (ret && ret != -ENOENT) - btrfs_abort_transaction(trans, ret); - return ret; } @@ -1217,7 +1194,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_block_rsv *block_rsv; int ret; @@ -1244,7 +1221,6 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node); btrfs_release_delayed_node(delayed_node); - btrfs_free_path(path); trans->block_rsv = block_rsv; return ret; @@ -1401,20 +1377,23 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info) { - WARN_ON(btrfs_first_delayed_node(fs_info->delayed_root)); + struct btrfs_delayed_node *node = btrfs_first_delayed_node(fs_info->delayed_root); + + if (WARN_ON(node)) + refcount_dec(&node->refs); } -static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) +static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) { int val = atomic_read(&delayed_root->items_seq); if (val < seq || val >= seq + BTRFS_DELAYED_BATCH) - return 1; + return true; if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) - return 1; + return true; - return 0; + return false; } void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) @@ -1561,9 +1540,8 @@ release_node: return ret; } -static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_node *node, - u64 index) +static bool btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node, + u64 index) { struct btrfs_delayed_item *item; @@ -1571,7 +1549,7 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info, item = __btrfs_lookup_delayed_item(&node->ins_root.rb_root, index); if (!item) { mutex_unlock(&node->mutex); - return 1; + return false; } /* @@ -1606,7 +1584,7 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info, } mutex_unlock(&node->mutex); - return 0; + return true; } int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, @@ -1620,9 +1598,10 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, if (IS_ERR(node)) return PTR_ERR(node); - ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, index); - if (!ret) + if (btrfs_delete_delayed_insertion_item(node, index)) { + ret = 0; goto end; + } item = btrfs_alloc_delayed_item(0, node, BTRFS_DELAYED_DELETION_ITEM); if (!item) { @@ -1639,7 +1618,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, */ if (ret < 0) { btrfs_err(trans->fs_info, -"metadata reservation failed for delayed dir item deltiona, should have been reserved"); +"metadata reservation failed for delayed dir item deletion, index: %llu, root: %llu, inode: %llu, error: %d", + index, btrfs_root_id(node->root), node->inode_id, ret); btrfs_release_delayed_item(item); goto end; } @@ -1648,9 +1628,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, ret = __btrfs_add_delayed_item(node, item); if (unlikely(ret)) { btrfs_err(trans->fs_info, - "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", - index, btrfs_root_id(node->root), - node->inode_id, ret); +"failed to add delayed dir index item, root: %llu, inode: %llu, index: %llu, error: %d", + index, btrfs_root_id(node->root), node->inode_id, ret); btrfs_delayed_item_release_metadata(dir->root, item); btrfs_release_delayed_item(item); } @@ -1755,17 +1734,16 @@ void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode, downgrade_write(&inode->vfs_inode.i_rwsem); } -int btrfs_should_delete_dir_index(const struct list_head *del_list, - u64 index) +bool btrfs_should_delete_dir_index(const struct list_head *del_list, u64 index) { struct btrfs_delayed_item *curr; - int ret = 0; + bool ret = false; list_for_each_entry(curr, del_list, readdir_list) { if (curr->index > index) break; if (curr->index == index) { - ret = 1; + ret = true; break; } } @@ -1775,15 +1753,14 @@ int btrfs_should_delete_dir_index(const struct list_head *del_list, /* * Read dir info stored in the delayed tree. */ -int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, - const struct list_head *ins_list) +bool btrfs_readdir_delayed_dir_index(struct dir_context *ctx, + const struct list_head *ins_list) { struct btrfs_dir_item *di; struct btrfs_delayed_item *curr, *next; struct btrfs_key location; char *name; int name_len; - int over = 0; unsigned char d_type; /* @@ -1792,6 +1769,8 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, * directory, nobody can delete any directory indexes now. */ list_for_each_entry_safe(curr, next, ins_list, readdir_list) { + bool over; + list_del(&curr->readdir_list); if (curr->index < ctx->pos) { @@ -1809,68 +1788,67 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, d_type = fs_ftype_to_dtype(btrfs_dir_flags_to_ftype(di->type)); btrfs_disk_key_to_cpu(&location, &di->location); - over = !dir_emit(ctx, name, name_len, - location.objectid, d_type); + over = !dir_emit(ctx, name, name_len, location.objectid, d_type); if (refcount_dec_and_test(&curr->refs)) kfree(curr); if (over) - return 1; + return true; ctx->pos++; } - return 0; + return false; } static void fill_stack_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *inode_item, - struct inode *inode) + struct btrfs_inode *inode) { + struct inode *vfs_inode = &inode->vfs_inode; u64 flags; - btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode)); - btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode)); - btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size); - btrfs_set_stack_inode_mode(inode_item, inode->i_mode); - btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink); - btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode)); - btrfs_set_stack_inode_generation(inode_item, - BTRFS_I(inode)->generation); + btrfs_set_stack_inode_uid(inode_item, i_uid_read(vfs_inode)); + btrfs_set_stack_inode_gid(inode_item, i_gid_read(vfs_inode)); + btrfs_set_stack_inode_size(inode_item, inode->disk_i_size); + btrfs_set_stack_inode_mode(inode_item, vfs_inode->i_mode); + btrfs_set_stack_inode_nlink(inode_item, vfs_inode->i_nlink); + btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(vfs_inode)); + btrfs_set_stack_inode_generation(inode_item, inode->generation); btrfs_set_stack_inode_sequence(inode_item, - inode_peek_iversion(inode)); + inode_peek_iversion(vfs_inode)); btrfs_set_stack_inode_transid(inode_item, trans->transid); - btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); - flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, - BTRFS_I(inode)->ro_flags); + btrfs_set_stack_inode_rdev(inode_item, vfs_inode->i_rdev); + flags = btrfs_inode_combine_flags(inode->flags, inode->ro_flags); btrfs_set_stack_inode_flags(inode_item, flags); btrfs_set_stack_inode_block_group(inode_item, 0); btrfs_set_stack_timespec_sec(&inode_item->atime, - inode_get_atime_sec(inode)); + inode_get_atime_sec(vfs_inode)); btrfs_set_stack_timespec_nsec(&inode_item->atime, - inode_get_atime_nsec(inode)); + inode_get_atime_nsec(vfs_inode)); btrfs_set_stack_timespec_sec(&inode_item->mtime, - inode_get_mtime_sec(inode)); + inode_get_mtime_sec(vfs_inode)); btrfs_set_stack_timespec_nsec(&inode_item->mtime, - inode_get_mtime_nsec(inode)); + inode_get_mtime_nsec(vfs_inode)); btrfs_set_stack_timespec_sec(&inode_item->ctime, - inode_get_ctime_sec(inode)); + inode_get_ctime_sec(vfs_inode)); btrfs_set_stack_timespec_nsec(&inode_item->ctime, - inode_get_ctime_nsec(inode)); + inode_get_ctime_nsec(vfs_inode)); - btrfs_set_stack_timespec_sec(&inode_item->otime, BTRFS_I(inode)->i_otime_sec); - btrfs_set_stack_timespec_nsec(&inode_item->otime, BTRFS_I(inode)->i_otime_nsec); + btrfs_set_stack_timespec_sec(&inode_item->otime, inode->i_otime_sec); + btrfs_set_stack_timespec_nsec(&inode_item->otime, inode->i_otime_nsec); } -int btrfs_fill_inode(struct inode *inode, u32 *rdev) +int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev) { - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_delayed_node *delayed_node; struct btrfs_inode_item *inode_item; + struct inode *vfs_inode = &inode->vfs_inode; - delayed_node = btrfs_get_delayed_node(BTRFS_I(inode)); + delayed_node = btrfs_get_delayed_node(inode); if (!delayed_node) return -ENOENT; @@ -1883,39 +1861,38 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) inode_item = &delayed_node->inode_item; - i_uid_write(inode, btrfs_stack_inode_uid(inode_item)); - i_gid_write(inode, btrfs_stack_inode_gid(inode_item)); - btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item)); - btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, - round_up(i_size_read(inode), fs_info->sectorsize)); - inode->i_mode = btrfs_stack_inode_mode(inode_item); - set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); - inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); - BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); - BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item); - - inode_set_iversion_queried(inode, - btrfs_stack_inode_sequence(inode_item)); - inode->i_rdev = 0; + i_uid_write(vfs_inode, btrfs_stack_inode_uid(inode_item)); + i_gid_write(vfs_inode, btrfs_stack_inode_gid(inode_item)); + btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); + btrfs_inode_set_file_extent_range(inode, 0, + round_up(i_size_read(vfs_inode), fs_info->sectorsize)); + vfs_inode->i_mode = btrfs_stack_inode_mode(inode_item); + set_nlink(vfs_inode, btrfs_stack_inode_nlink(inode_item)); + inode_set_bytes(vfs_inode, btrfs_stack_inode_nbytes(inode_item)); + inode->generation = btrfs_stack_inode_generation(inode_item); + inode->last_trans = btrfs_stack_inode_transid(inode_item); + + inode_set_iversion_queried(vfs_inode, btrfs_stack_inode_sequence(inode_item)); + vfs_inode->i_rdev = 0; *rdev = btrfs_stack_inode_rdev(inode_item); btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item), - &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); + &inode->flags, &inode->ro_flags); - inode_set_atime(inode, btrfs_stack_timespec_sec(&inode_item->atime), + inode_set_atime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->atime), btrfs_stack_timespec_nsec(&inode_item->atime)); - inode_set_mtime(inode, btrfs_stack_timespec_sec(&inode_item->mtime), + inode_set_mtime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->mtime), btrfs_stack_timespec_nsec(&inode_item->mtime)); - inode_set_ctime(inode, btrfs_stack_timespec_sec(&inode_item->ctime), + inode_set_ctime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->ctime), btrfs_stack_timespec_nsec(&inode_item->ctime)); - BTRFS_I(inode)->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime); - BTRFS_I(inode)->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime); + inode->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime); + inode->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime); - inode->i_generation = BTRFS_I(inode)->generation; - if (S_ISDIR(inode->i_mode)) - BTRFS_I(inode)->index_cnt = (u64)-1; + vfs_inode->i_generation = inode->generation; + if (S_ISDIR(vfs_inode->i_mode)) + inode->index_cnt = (u64)-1; mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_node(delayed_node); @@ -1935,8 +1912,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, mutex_lock(&delayed_node->mutex); if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { - fill_stack_inode_item(trans, &delayed_node->inode_item, - &inode->vfs_inode); + fill_stack_inode_item(trans, &delayed_node->inode_item, inode); goto release_node; } @@ -1944,7 +1920,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, if (ret) goto release_node; - fill_stack_inode_item(trans, &delayed_node->inode_item, &inode->vfs_inode); + fill_stack_inode_item(trans, &delayed_node->inode_item, inode); set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); delayed_node->count++; atomic_inc(&root->fs_info->delayed_root->items); diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index f4d9feac0d0e..e6e763ad2d42 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -133,7 +133,7 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode); int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); -int btrfs_fill_inode(struct inode *inode, u32 *rdev); +int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev); int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode); /* Used for drop dead root */ @@ -150,10 +150,9 @@ bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode, void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode, struct list_head *ins_list, struct list_head *del_list); -int btrfs_should_delete_dir_index(const struct list_head *del_list, - u64 index); -int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, - const struct list_head *ins_list); +bool btrfs_should_delete_dir_index(const struct list_head *del_list, u64 index); +bool btrfs_readdir_delayed_dir_index(struct dir_context *ctx, + const struct list_head *ins_list); /* Used during directory logging. */ void btrfs_log_get_delayed_items(struct btrfs_inode *inode, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 0d878dbbabba..ca382c5b186f 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -93,6 +93,9 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) u64 num_bytes; u64 reserved_bytes; + if (btrfs_is_testing(fs_info)) + return; + num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates); num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, trans->delayed_ref_csum_deletions); @@ -254,7 +257,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, spin_unlock(&block_rsv->lock); if (to_free > 0) - btrfs_space_info_free_bytes_may_use(fs_info, space_info, to_free); + btrfs_space_info_free_bytes_may_use(space_info, to_free); if (refilled_bytes > 0) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, @@ -265,8 +268,8 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, /* * compare two delayed data backrefs with same bytenr and type */ -static int comp_data_refs(struct btrfs_delayed_ref_node *ref1, - struct btrfs_delayed_ref_node *ref2) +static int comp_data_refs(const struct btrfs_delayed_ref_node *ref1, + const struct btrfs_delayed_ref_node *ref2) { if (ref1->data_ref.objectid < ref2->data_ref.objectid) return -1; @@ -279,8 +282,8 @@ static int comp_data_refs(struct btrfs_delayed_ref_node *ref1, return 0; } -static int comp_refs(struct btrfs_delayed_ref_node *ref1, - struct btrfs_delayed_ref_node *ref2, +static int comp_refs(const struct btrfs_delayed_ref_node *ref1, + const struct btrfs_delayed_ref_node *ref2, bool check_seq) { int ret = 0; @@ -314,35 +317,23 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1, return 0; } +static int cmp_refs_node(const struct rb_node *new, const struct rb_node *exist) +{ + const struct btrfs_delayed_ref_node *new_node = + rb_entry(new, struct btrfs_delayed_ref_node, ref_node); + const struct btrfs_delayed_ref_node *exist_node = + rb_entry(exist, struct btrfs_delayed_ref_node, ref_node); + + return comp_refs(new_node, exist_node, true); +} + static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, struct btrfs_delayed_ref_node *ins) { - struct rb_node **p = &root->rb_root.rb_node; struct rb_node *node = &ins->ref_node; - struct rb_node *parent_node = NULL; - struct btrfs_delayed_ref_node *entry; - bool leftmost = true; - - while (*p) { - int comp; - - parent_node = *p; - entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, - ref_node); - comp = comp_refs(ins, entry, true); - if (comp < 0) { - p = &(*p)->rb_left; - } else if (comp > 0) { - p = &(*p)->rb_right; - leftmost = false; - } else { - return entry; - } - } + struct rb_node *exist = rb_find_add_cached(node, root, cmp_refs_node); - rb_link_node(node, parent_node, p); - rb_insert_color_cached(node, root, leftmost); - return NULL; + return rb_entry_safe(exist, struct btrfs_delayed_ref_node, ref_node); } static struct btrfs_delayed_ref_head *find_first_ref_head( @@ -555,6 +546,32 @@ void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info, delayed_refs->num_heads_ready--; } +struct btrfs_delayed_ref_node *btrfs_select_delayed_ref(struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_ref_node *ref; + + lockdep_assert_held(&head->mutex); + lockdep_assert_held(&head->lock); + + if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) + return NULL; + + /* + * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. + * This is to prevent a ref count from going down to zero, which deletes + * the extent item from the extent tree, when there still are references + * to add, which would fail because they would not find the extent item. + */ + if (!list_empty(&head->ref_add_list)) + return list_first_entry(&head->ref_add_list, + struct btrfs_delayed_ref_node, add_list); + + ref = rb_entry(rb_first_cached(&head->ref_tree), + struct btrfs_delayed_ref_node, ref_node); + ASSERT(list_empty(&ref->add_list)); + return ref; +} + /* * Helper to insert the ref_node to the tail or merge with tail. * @@ -911,7 +928,7 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; - if (is_fstree(generic_ref->ref_root)) + if (btrfs_is_fstree(generic_ref->ref_root)) seq = atomic64_read(&fs_info->tree_mod_seq); refcount_set(&ref->refs, 1); @@ -941,8 +958,8 @@ void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root, #endif generic_ref->tree_ref.level = level; generic_ref->type = BTRFS_REF_METADATA; - if (skip_qgroup || !(is_fstree(generic_ref->ref_root) && - (!mod_root || is_fstree(mod_root)))) + if (skip_qgroup || !(btrfs_is_fstree(generic_ref->ref_root) && + (!mod_root || btrfs_is_fstree(mod_root)))) generic_ref->skip_qgroup = true; else generic_ref->skip_qgroup = false; @@ -959,8 +976,8 @@ void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset, generic_ref->data_ref.objectid = ino; generic_ref->data_ref.offset = offset; generic_ref->type = BTRFS_REF_DATA; - if (skip_qgroup || !(is_fstree(generic_ref->ref_root) && - (!mod_root || is_fstree(mod_root)))) + if (skip_qgroup || !(btrfs_is_fstree(generic_ref->ref_root) && + (!mod_root || btrfs_is_fstree(mod_root)))) generic_ref->skip_qgroup = true; else generic_ref->skip_qgroup = false; @@ -1234,6 +1251,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) { struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; struct btrfs_fs_info *fs_info = trans->fs_info; + bool testing = btrfs_is_testing(fs_info); spin_lock(&delayed_refs->lock); while (true) { @@ -1263,7 +1281,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) spin_unlock(&delayed_refs->lock); mutex_unlock(&head->mutex); - if (pin_bytes) { + if (!testing && pin_bytes) { struct btrfs_block_group *bg; bg = btrfs_lookup_block_group(fs_info, head->bytenr); @@ -1281,8 +1299,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) spin_lock(&bg->space_info->lock); spin_lock(&bg->lock); bg->pinned += head->num_bytes; - btrfs_space_info_update_bytes_pinned(fs_info, - bg->space_info, + btrfs_space_info_update_bytes_pinned(bg->space_info, head->num_bytes); bg->reserved -= head->num_bytes; bg->space_info->bytes_reserved -= head->num_bytes; @@ -1295,12 +1312,15 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) btrfs_error_unpin_extent_range(fs_info, head->bytenr, head->bytenr + head->num_bytes - 1); } - btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); + if (!testing) + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); btrfs_put_delayed_ref_head(head); cond_resched(); spin_lock(&delayed_refs->lock); } - btrfs_qgroup_destroy_extent_records(trans); + + if (!testing) + btrfs_qgroup_destroy_extent_records(trans); spin_unlock(&delayed_refs->lock); } @@ -1316,7 +1336,7 @@ int __init btrfs_delayed_ref_init(void) { btrfs_delayed_ref_head_cachep = KMEM_CACHE(btrfs_delayed_ref_head, 0); if (!btrfs_delayed_ref_head_cachep) - goto fail; + return -ENOMEM; btrfs_delayed_ref_node_cachep = KMEM_CACHE(btrfs_delayed_ref_node, 0); if (!btrfs_delayed_ref_node_cachep) diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 611fb3388f82..552ec4fa645d 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -14,6 +14,8 @@ #include <linux/spinlock.h> #include <linux/slab.h> #include <uapi/linux/btrfs_tree.h> +#include "fs.h" +#include "messages.h" struct btrfs_trans_handle; struct btrfs_fs_info; @@ -260,7 +262,6 @@ enum btrfs_ref_type { BTRFS_REF_NOT_SET, BTRFS_REF_DATA, BTRFS_REF_METADATA, - BTRFS_REF_LAST, } __packed; struct btrfs_ref { @@ -402,6 +403,7 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head( struct btrfs_delayed_ref_root *delayed_refs); void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); +struct btrfs_delayed_ref_node *btrfs_select_delayed_ref(struct btrfs_delayed_ref_head *head); int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); @@ -418,7 +420,7 @@ bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, u64 root, u64 parent); void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans); -static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node) +static inline u64 btrfs_delayed_ref_owner(const struct btrfs_delayed_ref_node *node) { if (node->type == BTRFS_EXTENT_DATA_REF_KEY || node->type == BTRFS_SHARED_DATA_REF_KEY) @@ -426,7 +428,7 @@ static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node) return node->tree_ref.level; } -static inline u64 btrfs_delayed_ref_offset(struct btrfs_delayed_ref_node *node) +static inline u64 btrfs_delayed_ref_offset(const struct btrfs_delayed_ref_node *node) { if (node->type == BTRFS_EXTENT_DATA_REF_KEY || node->type == BTRFS_SHARED_DATA_REF_KEY) @@ -434,7 +436,7 @@ static inline u64 btrfs_delayed_ref_offset(struct btrfs_delayed_ref_node *node) return 0; } -static inline u8 btrfs_ref_type(struct btrfs_ref *ref) +static inline u8 btrfs_ref_type(const struct btrfs_ref *ref) { ASSERT(ref->type == BTRFS_REF_DATA || ref->type == BTRFS_REF_METADATA); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index ac8e97ed13f7..4675bcd5f92e 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -76,7 +76,7 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) struct extent_buffer *eb; int slot; int ret = 0; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); int item_size; struct btrfs_dev_replace_item *ptr; u64 src_devid; @@ -85,10 +85,8 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) return 0; path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; key.objectid = 0; key.type = BTRFS_DEV_REPLACE_KEY; @@ -103,10 +101,8 @@ no_valid_dev_replace_entry_found: if (btrfs_find_device(fs_info->fs_devices, &args)) { btrfs_err(fs_info, "found replace target device without a valid replace item"); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } - ret = 0; dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; dev_replace->cont_reading_from_srcdev_mode = @@ -123,7 +119,7 @@ no_valid_dev_replace_entry_found: dev_replace->tgtdev = NULL; dev_replace->is_valid = 0; dev_replace->item_needs_writeback = 0; - goto out; + return 0; } slot = path->slots[0]; eb = path->nodes[0]; @@ -226,8 +222,6 @@ no_valid_dev_replace_entry_found: break; } -out: - btrfs_free_path(path); return ret; } @@ -256,7 +250,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, } bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, - fs_info->bdev_holder, NULL); + fs_info->sb, &fs_holder_ops); if (IS_ERR(bdev_file)) { btrfs_err(fs_info, "target device %s is invalid!", device_path); return PTR_ERR(bdev_file); @@ -333,7 +327,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return 0; error: - fput(bdev_file); + bdev_fput(bdev_file); return ret; } @@ -346,7 +340,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; int ret; struct btrfs_root *dev_root = fs_info->dev_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *eb; struct btrfs_dev_replace_item *ptr; @@ -365,16 +359,15 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) key.offset = 0; path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { btrfs_warn(fs_info, "error %d while searching for dev_replace item!", ret); - goto out; + return ret; } if (ret == 0 && @@ -395,7 +388,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) btrfs_warn(fs_info, "delete too small dev_replace item failed %d!", ret); - goto out; + return ret; } ret = 1; } @@ -408,7 +401,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) if (ret < 0) { btrfs_warn(fs_info, "insert dev_replace item failed %d!", ret); - goto out; + return ret; } } @@ -441,11 +434,6 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) dev_replace->item_needs_writeback = 0; up_write(&dev_replace->rwsem); - btrfs_mark_buffer_dirty(trans, eb); - -out: - btrfs_free_path(path); - return ret; } @@ -612,7 +600,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return PTR_ERR(src_device); if (btrfs_pinned_by_swapfile(fs_info, src_device)) { - btrfs_warn_in_rcu(fs_info, + btrfs_warn(fs_info, "cannot replace device %s (devid %llu) due to active swapfile", btrfs_dev_name(src_device), src_device->devid); return -ETXTBSY; @@ -649,7 +637,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: - ASSERT(0); + DEBUG_WARN("unexpected STARTED ot SUSPENDED dev-replace state"); ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; up_write(&dev_replace->rwsem); goto leave; @@ -659,7 +647,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, dev_replace->srcdev = src_device; dev_replace->tgtdev = tgt_device; - btrfs_info_in_rcu(fs_info, + btrfs_info(fs_info, "dev_replace from %s (devid %llu) to %s started", btrfs_dev_name(src_device), src_device->devid, @@ -806,17 +794,17 @@ static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev, lockdep_assert_held(&srcdev->fs_info->chunk_mutex); - while (find_first_extent_bit(&srcdev->alloc_state, start, - &found_start, &found_end, - CHUNK_ALLOCATED, &cached_state)) { - ret = set_extent_bit(&tgtdev->alloc_state, found_start, - found_end, CHUNK_ALLOCATED, NULL); + while (btrfs_find_first_extent_bit(&srcdev->alloc_state, start, + &found_start, &found_end, + CHUNK_ALLOCATED, &cached_state)) { + ret = btrfs_set_extent_bit(&tgtdev->alloc_state, found_start, + found_end, CHUNK_ALLOCATED, NULL); if (ret) break; start = found_end + 1; } - free_extent_state(cached_state); + btrfs_free_extent_state(cached_state); return ret; } @@ -955,7 +943,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, tgt_device); } else { if (scrub_ret != -ECANCELED) - btrfs_err_in_rcu(fs_info, + btrfs_err(fs_info, "btrfs_scrub_dev(%s, %llu, %s) failed %d", btrfs_dev_name(src_device), src_device->devid, @@ -973,7 +961,7 @@ error: return scrub_ret; } - btrfs_info_in_rcu(fs_info, + btrfs_info(fs_info, "dev_replace from %s (devid %llu) to %s finished", btrfs_dev_name(src_device), src_device->devid, @@ -1121,7 +1109,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) * btrfs_dev_replace_finishing() will handle the * cleanup part */ - btrfs_info_in_rcu(fs_info, + btrfs_info(fs_info, "dev_replace from %s (devid %llu) to %s canceled", btrfs_dev_name(src_device), src_device->devid, btrfs_dev_name(tgt_device)); @@ -1155,7 +1143,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) ret = btrfs_commit_transaction(trans); WARN_ON(ret); - btrfs_info_in_rcu(fs_info, + btrfs_info(fs_info, "suspended dev_replace from %s (devid %llu) to %s canceled", btrfs_dev_name(src_device), src_device->devid, btrfs_dev_name(tgt_device)); @@ -1259,7 +1247,7 @@ static int btrfs_dev_replace_kthread(void *data) progress = btrfs_dev_replace_progress(fs_info); progress = div_u64(progress, 10); - btrfs_info_in_rcu(fs_info, + btrfs_info(fs_info, "continuing dev_replace from %s (devid %llu) to target %s @%u%%", btrfs_dev_name(dev_replace->srcdev), dev_replace->srcdev->devid, @@ -1277,16 +1265,16 @@ static int btrfs_dev_replace_kthread(void *data) return 0; } -int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) +bool __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) { if (!dev_replace->is_valid) - return 0; + return false; switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: - return 0; + return false; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: /* @@ -1301,7 +1289,7 @@ int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) */ break; } - return 1; + return true; } void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount) diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 23e480efe5e6..b35cecf388f2 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -25,7 +25,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); -int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); +bool __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, struct btrfs_block_group *cache, u64 physical); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 1ea5d8fcfbf7..69863e398e22 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -92,7 +92,6 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, name, name_ptr, name_len); write_extent_buffer(leaf, data, data_ptr, data_len); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); return ret; } @@ -152,7 +151,6 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, name_ptr = (unsigned long)(dir_item + 1); write_extent_buffer(leaf, name->name, name_ptr, name->len); - btrfs_mark_buffer_dirty(trans, leaf); second_insert: /* FIXME, use some real flag for selecting the extra index */ @@ -229,7 +227,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, return di; } -int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir_ino, const struct fscrypt_str *name) { int ret; @@ -238,13 +236,13 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, int data_size; struct extent_buffer *leaf; int slot; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) return -ENOMEM; - key.objectid = dir; + key.objectid = dir_ino; key.type = BTRFS_DIR_ITEM_KEY; key.offset = btrfs_name_hash(name->name, name->len); @@ -253,20 +251,17 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, if (IS_ERR(di)) { ret = PTR_ERR(di); /* Nothing found, we're safe */ - if (ret == -ENOENT) { - ret = 0; - goto out; - } + if (ret == -ENOENT) + return 0; if (ret < 0) - goto out; + return ret; } /* we found an item, look for our name in the item */ if (di) { /* our exact name was found */ - ret = -EEXIST; - goto out; + return -EEXIST; } /* See if there is room in the item to insert this name. */ @@ -275,14 +270,11 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, slot = path->slots[0]; if (data_size + btrfs_item_size(leaf, slot) + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) { - ret = -EOVERFLOW; - } else { - /* plenty of insertion room */ - ret = 0; + return -EOVERFLOW; } -out: - btrfs_free_path(path); - return ret; + + /* Plenty of insertion room. */ + return 0; } /* diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h index 28d69970bc70..e52174a8baf9 100644 --- a/fs/btrfs/dir-item.h +++ b/fs/btrfs/dir-item.h @@ -10,10 +10,11 @@ struct fscrypt_str; struct btrfs_fs_info; struct btrfs_key; struct btrfs_path; +struct btrfs_inode; struct btrfs_root; struct btrfs_trans_handle; -int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir_ino, const struct fscrypt_str *name); int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const struct fscrypt_str *name, struct btrfs_inode *dir, diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index a7c3e221378d..fe9a4bd7e6e6 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -42,21 +42,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, /* Direct lock must be taken before the extent lock. */ if (nowait) { - if (!try_lock_dio_extent(io_tree, lockstart, lockend, cached_state)) + if (!btrfs_try_lock_dio_extent(io_tree, lockstart, lockend, cached_state)) return -EAGAIN; } else { - lock_dio_extent(io_tree, lockstart, lockend, cached_state); + btrfs_lock_dio_extent(io_tree, lockstart, lockend, cached_state); } while (1) { if (nowait) { - if (!try_lock_extent(io_tree, lockstart, lockend, - cached_state)) { + if (!btrfs_try_lock_extent(io_tree, lockstart, lockend, + cached_state)) { ret = -EAGAIN; break; } } else { - lock_extent(io_tree, lockstart, lockend, cached_state); + btrfs_lock_extent(io_tree, lockstart, lockend, cached_state); } /* * We're concerned with the entire range that we're going to be @@ -78,7 +78,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, lockstart, lockend))) break; - unlock_extent(io_tree, lockstart, lockend, cached_state); + btrfs_unlock_extent(io_tree, lockstart, lockend, cached_state); if (ordered) { if (nowait) { @@ -131,7 +131,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, } if (ret) - unlock_dio_extent(io_tree, lockstart, lockend, cached_state); + btrfs_unlock_dio_extent(io_tree, lockstart, lockend, cached_state); return ret; } @@ -151,11 +151,11 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, } ordered = btrfs_alloc_ordered_extent(inode, start, file_extent, - (1 << type) | - (1 << BTRFS_ORDERED_DIRECT)); + (1U << type) | + (1U << BTRFS_ORDERED_DIRECT)); if (IS_ERR(ordered)) { if (em) { - free_extent_map(em); + btrfs_free_extent_map(em); btrfs_drop_extent_map_range(inode, start, start + file_extent->num_bytes - 1, false); } @@ -204,8 +204,7 @@ again: BTRFS_ORDERED_REGULAR); btrfs_dec_block_group_reservations(fs_info, ins.objectid); if (IS_ERR(em)) - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, - 1); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); return em; } @@ -246,10 +245,10 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, else type = BTRFS_ORDERED_NOCOW; len = min(len, em->len - (start - em->start)); - block_start = extent_map_block_start(em) + (start - em->start); + block_start = btrfs_extent_map_block_start(em) + (start - em->start); - if (can_nocow_extent(inode, start, &len, - &file_extent, false, false) == 1) { + if (can_nocow_extent(BTRFS_I(inode), start, &len, &file_extent, + false) == 1) { bg = btrfs_inc_nocow_writers(fs_info, block_start); if (bg) can_nocow = true; @@ -265,7 +264,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, nowait); if (ret < 0) { /* Our caller expects us to free the input extent map. */ - free_extent_map(em); + btrfs_free_extent_map(em); *map = NULL; btrfs_dec_nocow_writers(bg); if (nowait && (ret == -ENOSPC || ret == -EDQUOT)) @@ -278,7 +277,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, &file_extent, type); btrfs_dec_nocow_writers(bg); if (type == BTRFS_ORDERED_PREALLOC) { - free_extent_map(em); + btrfs_free_extent_map(em); *map = em2; em = em2; } @@ -291,7 +290,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, dio_data->nocow_done = true; } else { /* Our caller expects us to free the input extent map. */ - free_extent_map(em); + btrfs_free_extent_map(em); *map = NULL; if (nowait) { @@ -440,8 +439,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, start, data_alloc_len, false); if (!ret) dio_data->data_space_reserved = true; - else if (ret && !(BTRFS_I(inode)->flags & - (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) + else if (!(BTRFS_I(inode)->flags & + (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) goto err; } @@ -474,8 +473,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, * to buffered IO. Don't blame me, this is the price we pay for using * the generic code. */ - if (extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) { - free_extent_map(em); + if (btrfs_extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) { + btrfs_free_extent_map(em); /* * If we are in a NOWAIT context, return -EAGAIN in order to * fallback to buffered IO. This is not only because we can @@ -516,7 +515,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, * after we have submitted bios for all the extents in the range. */ if ((flags & IOMAP_NOWAIT) && len < length) { - free_extent_map(em); + btrfs_free_extent_map(em); ret = -EAGAIN; goto unlock_err; } @@ -558,13 +557,13 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_HOLE; } else { - iomap->addr = extent_map_block_start(em) + (start - em->start); + iomap->addr = btrfs_extent_map_block_start(em) + (start - em->start); iomap->type = IOMAP_MAPPED; } iomap->offset = start; iomap->bdev = fs_info->fs_devices->latest_dev->bdev; iomap->length = len; - free_extent_map(em); + btrfs_free_extent_map(em); /* * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed, @@ -575,13 +574,13 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (write) unlock_bits |= EXTENT_DIO_LOCKED; - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - unlock_bits, &cached_state); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + unlock_bits, &cached_state); /* We didn't use everything, unlock the dio extent for the remainder. */ if (!write && (start + len) < lockend) - unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len, - lockend, NULL); + btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len, + lockend, NULL); return 0; @@ -591,8 +590,8 @@ unlock_err: * to update this, be explicit that we expect EXTENT_LOCKED and * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing. */ - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state); err: if (dio_data->data_space_reserved) { btrfs_free_reserved_data_space(BTRFS_I(inode), @@ -615,8 +614,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, if (!write && (iomap->type == IOMAP_HOLE)) { /* If reading from a hole, unlock and return */ - unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, - pos + length - 1, NULL); + btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1, NULL); return 0; } @@ -627,8 +626,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, btrfs_finish_ordered_extent(dio_data->ordered, NULL, pos, length, false); else - unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, - pos + length - 1, NULL); + btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1, NULL); ret = -ENOTBLK; } if (write) { @@ -660,8 +659,8 @@ static void btrfs_dio_end_io(struct btrfs_bio *bbio) dip->file_offset, dip->bytes, !bio->bi_status); } else { - unlock_dio_extent(&inode->io_tree, dip->file_offset, - dip->file_offset + dip->bytes - 1, NULL); + btrfs_unlock_dio_extent(&inode->io_tree, dip->file_offset, + dip->file_offset + dip->bytes - 1, NULL); } bbio->bio.bi_private = bbio->private; @@ -692,9 +691,9 @@ static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, * a pre-existing one. */ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { - ret = split_extent_map(bbio->inode, bbio->file_offset, - ordered->num_bytes, len, - ordered->disk_bytenr); + ret = btrfs_split_extent_map(bbio->inode, bbio->file_offset, + ordered->num_bytes, len, + ordered->disk_bytenr); if (ret) return ret; } @@ -856,6 +855,22 @@ relock: btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); goto buffered; } + /* + * We can't control the folios being passed in, applications can write + * to them while a direct IO write is in progress. This means the + * content might change after we calculated the data checksum. + * Therefore we can end up storing a checksum that doesn't match the + * persisted data. + * + * To be extra safe and avoid false data checksum mismatch, if the + * inode requires data checksum, just fallback to buffered IO. + * For buffered IO we have full control of page cache and can ensure + * no one is modifying the content during writeback. + */ + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + goto buffered; + } /* * The iov_iter can be mapped to the same file range we are writing to. diff --git a/fs/btrfs/direct-io.h b/fs/btrfs/direct-io.h index 3dc3ea926afe..df5d45ee6de7 100644 --- a/fs/btrfs/direct-io.h +++ b/fs/btrfs/direct-io.h @@ -5,6 +5,8 @@ #include <linux/types.h> +struct kiocb; + int __init btrfs_init_dio(void); void __cold btrfs_destroy_dio(void); diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index e815d165cccc..89fe85778115 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -94,8 +94,6 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { lockdep_assert_held(&discard_ctl->lock); - if (!btrfs_run_discard_work(discard_ctl)) - return; if (list_empty(&block_group->discard_list) || block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) { @@ -118,6 +116,9 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, if (!btrfs_is_block_group_data_only(block_group)) return; + if (!btrfs_run_discard_work(discard_ctl)) + return; + spin_lock(&discard_ctl->lock); __add_to_discard_list(discard_ctl, block_group); spin_unlock(&discard_ctl->lock); @@ -167,13 +168,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, block_group->discard_eligible_time = 0; queued = !list_empty(&block_group->discard_list); list_del_init(&block_group->discard_list); - /* - * If the block group is currently running in the discard workfn, we - * don't want to deref it, since it's still being used by the workfn. - * The workfn will notice this case and deref the block group when it is - * finished. - */ - if (queued && !running) + if (queued) btrfs_put_block_group(block_group); spin_unlock(&discard_ctl->lock); @@ -250,6 +245,20 @@ again: block_group->used != 0) { if (btrfs_is_block_group_data_only(block_group)) { __add_to_discard_list(discard_ctl, block_group); + /* + * The block group must have been moved to other + * discard list even if discard was disabled in + * the meantime or a transaction abort happened, + * otherwise we can end up in an infinite loop, + * always jumping into the 'again' label and + * keep getting this block group over and over + * in case there are no other block groups in + * the discard lists. + */ + ASSERT(block_group->discard_index != + BTRFS_DISCARD_INDEX_UNUSED, + "discard_index=%d", + block_group->discard_index); } else { list_del_init(&block_group->discard_list); btrfs_put_block_group(block_group); @@ -260,9 +269,10 @@ again: block_group->discard_cursor = block_group->start; block_group->discard_state = BTRFS_DISCARD_EXTENTS; } - discard_ctl->block_group = block_group; } if (block_group) { + btrfs_get_block_group(block_group); + discard_ctl->block_group = block_group; *discard_state = block_group->discard_state; *discard_index = block_group->discard_index; } @@ -493,9 +503,20 @@ static void btrfs_discard_workfn(struct work_struct *work) block_group = peek_discard_list(discard_ctl, &discard_state, &discard_index, now); - if (!block_group || !btrfs_run_discard_work(discard_ctl)) + if (!block_group) + return; + if (!btrfs_run_discard_work(discard_ctl)) { + spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); + discard_ctl->block_group = NULL; + spin_unlock(&discard_ctl->lock); return; + } if (now < block_group->discard_eligible_time) { + spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); + discard_ctl->block_group = NULL; + spin_unlock(&discard_ctl->lock); btrfs_discard_schedule_work(discard_ctl, false); return; } @@ -547,15 +568,7 @@ static void btrfs_discard_workfn(struct work_struct *work) spin_lock(&discard_ctl->lock); discard_ctl->prev_discard = trimmed; discard_ctl->prev_discard_time = now; - /* - * If the block group was removed from the discard list while it was - * running in this workfn, then we didn't deref it, since this function - * still owned that reference. But we set the discard_ctl->block_group - * back to NULL, so we can use that condition to know that now we need - * to deref the block_group. - */ - if (discard_ctl->block_group == NULL) - btrfs_put_block_group(block_group); + btrfs_put_block_group(block_group); discard_ctl->block_group = NULL; __btrfs_discard_schedule_work(discard_ctl, now, false); spin_unlock(&discard_ctl->lock); diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h index dddb0f9101ba..2c5e85394092 100644 --- a/fs/btrfs/discard.h +++ b/fs/btrfs/discard.h @@ -3,6 +3,7 @@ #ifndef BTRFS_DISCARD_H #define BTRFS_DISCARD_H +#include <linux/types.h> #include <linux/sizes.h> struct btrfs_fs_info; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 814320948645..70fc4e7cc5a0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -182,22 +182,22 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) { struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios = num_extent_folios(eb); int ret = 0; if (sb_rdonly(fs_info->sb)) return -EROFS; - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; u64 start = max_t(u64, eb->start, folio_pos(folio)); u64 end = min_t(u64, eb->start + eb->len, folio_pos(folio) + eb->folio_size); u32 len = end - start; + phys_addr_t paddr = PFN_PHYS(folio_pfn(folio)) + + offset_in_folio(folio, start); - ret = btrfs_repair_io_failure(fs_info, 0, start, len, - start, folio, offset_in_folio(folio, start), - mirror_num); + ret = btrfs_repair_io_failure(fs_info, 0, start, len, start, + paddr, mirror_num); if (ret) break; } @@ -225,8 +225,7 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, ASSERT(check); while (1) { - clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); - ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num, check); + ret = read_extent_buffer_pages(eb, mirror_num, check); if (!ret) break; @@ -257,7 +256,7 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, /* * Checksum a dirty tree block before IO. */ -blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) +int btree_csum_one_bio(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; @@ -268,9 +267,9 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) /* Btree blocks are always contiguous on disk. */ if (WARN_ON_ONCE(bbio->file_offset != eb->start)) - return BLK_STS_IOERR; + return -EIO; if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len)) - return BLK_STS_IOERR; + return -EIO; /* * If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't @@ -279,14 +278,13 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) */ if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) { memzero_extent_buffer(eb, 0, eb->len); - return BLK_STS_OK; + return 0; } if (WARN_ON_ONCE(found_start != eb->start)) - return BLK_STS_IOERR; - if (WARN_ON(!btrfs_folio_test_uptodate(fs_info, eb->folios[0], - eb->start, eb->len))) - return BLK_STS_IOERR; + return -EIO; + if (WARN_ON(!btrfs_meta_folio_test_uptodate(eb->folios[0], eb))) + return -EIO; ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, offsetof(struct btrfs_header, fsid), @@ -314,7 +312,7 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) goto error; } write_extent_buffer(eb, result, 0, fs_info->csum_size); - return BLK_STS_OK; + return 0; error: btrfs_print_tree(eb, 0); @@ -328,7 +326,7 @@ error: */ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) || btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID); - return errno_to_blk_status(ret); + return ret; } static bool check_tree_block_fsid(struct extent_buffer *eb) @@ -454,15 +452,9 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, goto out; } - /* - * If this is a leaf block and it is corrupt, set the corrupt bit so - * that we don't try and read the other copies of this block, just - * return -EIO. - */ - if (found_level == 0 && btrfs_check_leaf(eb)) { - set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); + /* If this is a leaf block and it is corrupt, just return -EIO. */ + if (found_level == 0 && btrfs_check_leaf(eb)) ret = -EIO; - } if (found_level > 0 && btrfs_check_node(eb)) ret = -EIO; @@ -643,11 +635,16 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, } -static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, - u64 objectid) +static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, + u64 objectid, gfp_t flags) { + struct btrfs_root *root; bool dummy = btrfs_is_testing(fs_info); + root = kzalloc(sizeof(*root), flags); + if (!root) + return NULL; + memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); @@ -700,10 +697,10 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, btrfs_set_root_last_log_commit(root, 0); root->anon_dev = 0; if (!dummy) { - extent_io_tree_init(fs_info, &root->dirty_log_pages, - IO_TREE_ROOT_DIRTY_LOG_PAGES); - extent_io_tree_init(fs_info, &root->log_csum_range, - IO_TREE_LOG_CSUM_RANGE); + btrfs_extent_io_tree_init(fs_info, &root->dirty_log_pages, + IO_TREE_ROOT_DIRTY_LOG_PAGES); + btrfs_extent_io_tree_init(fs_info, &root->log_csum_range, + IO_TREE_LOG_CSUM_RANGE); } spin_lock_init(&root->root_item_lock); @@ -714,14 +711,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, list_add_tail(&root->leak_list, &fs_info->allocated_roots); spin_unlock(&fs_info->fs_roots_radix_lock); #endif -} -static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, - u64 objectid, gfp_t flags) -{ - struct btrfs_root *root = kzalloc(sizeof(*root), flags); - if (root) - __setup_root(root, fs_info, objectid); return root; } @@ -894,7 +884,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_set_root_used(&root->root_item, leaf->len); btrfs_set_root_last_snapshot(&root->root_item, 0); btrfs_set_root_dirid(&root->root_item, 0); - if (is_fstree(objectid)) + if (btrfs_is_fstree(objectid)) generate_random_guid(root->root_item.uuid); else export_guid(root->root_item.uuid, &guid_null); @@ -1089,21 +1079,22 @@ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, const struct btrfs_key *key) { struct btrfs_root *root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) return ERR_PTR(-ENOMEM); root = read_tree_root_path(tree_root, path, key); - btrfs_free_path(path); return root; } /* - * Initialize subvolume root in-memory structure + * Initialize subvolume root in-memory structure. * * @anon_dev: anonymous device to attach to the root, if zero, allocate new + * + * In case of failure the caller is responsible to call btrfs_free_fs_root() */ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) { @@ -1113,7 +1104,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) if (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID && !btrfs_is_data_reloc_root(root) && - is_fstree(btrfs_root_id(root))) { + btrfs_is_fstree(btrfs_root_id(root))) { set_bit(BTRFS_ROOT_SHAREABLE, &root->state); btrfs_check_and_init_root_item(&root->root_item); } @@ -1122,12 +1113,12 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) * Don't assign anonymous block device to roots that are not exposed to * userspace, the id pool is limited to 1M */ - if (is_fstree(btrfs_root_id(root)) && + if (btrfs_is_fstree(btrfs_root_id(root)) && btrfs_root_refs(&root->root_item) > 0) { if (!anon_dev) { ret = get_anon_bdev(&root->anon_dev); if (ret) - goto fail; + return ret; } else { root->anon_dev = anon_dev; } @@ -1137,7 +1128,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) ret = btrfs_init_root_free_objectid(root); if (ret) { mutex_unlock(&root->objectid_mutex); - goto fail; + return ret; } ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); @@ -1145,9 +1136,6 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) mutex_unlock(&root->objectid_mutex); return 0; -fail: - /* The caller is responsible to call btrfs_free_fs_root */ - return ret; } static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, @@ -1258,6 +1246,9 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { struct percpu_counter *em_counter = &fs_info->evictable_extent_maps; + if (fs_info->fs_devices) + btrfs_close_devices(fs_info->fs_devices); + percpu_counter_destroy(&fs_info->stats_read_blocks); percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); percpu_counter_destroy(&fs_info->ordered_bytes); @@ -1326,7 +1317,7 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info, * This is namely for free-space-tree and quota tree, which can change * at runtime and should only be grabbed from fs_info. */ - if (!is_fstree(objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) + if (!btrfs_is_fstree(objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) return ERR_PTR(-ENOENT); again: root = btrfs_lookup_fs_root(fs_info, objectid); @@ -1567,7 +1558,7 @@ static int transaction_kthread(void *arg) do { cannot_commit = false; - delay = msecs_to_jiffies(fs_info->commit_interval * 1000); + delay = secs_to_jiffies(fs_info->commit_interval); mutex_lock(&fs_info->transaction_kthread_mutex); spin_lock(&fs_info->trans_lock); @@ -1582,9 +1573,9 @@ static int transaction_kthread(void *arg) cur->state < TRANS_STATE_COMMIT_PREP && delta < fs_info->commit_interval) { spin_unlock(&fs_info->trans_lock); - delay -= msecs_to_jiffies((delta - 1) * 1000); + delay -= secs_to_jiffies(delta - 1); delay = min(delay, - msecs_to_jiffies(fs_info->commit_interval * 1000)); + secs_to_jiffies(fs_info->commit_interval)); goto sleep; } transid = cur->transid; @@ -1846,6 +1837,8 @@ void btrfs_put_root(struct btrfs_root *root) if (refcount_dec_and_test(&root->refs)) { if (WARN_ON(!xa_empty(&root->inodes))) xa_destroy(&root->inodes); + if (WARN_ON(!xa_empty(&root->delayed_nodes))) + xa_destroy(&root->delayed_nodes); WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)); if (root->anon_dev) free_anon_bdev(root->anon_dev); @@ -1866,8 +1859,8 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) int i; while (!list_empty(&fs_info->dead_roots)) { - gang[0] = list_entry(fs_info->dead_roots.next, - struct btrfs_root, root_list); + gang[0] = list_first_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); list_del(&gang[0]->root_list); if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) @@ -1930,9 +1923,9 @@ static int btrfs_init_btree_inode(struct super_block *sb) inode->i_mapping->a_ops = &btree_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, - IO_TREE_BTREE_INODE_IO); - extent_map_tree_init(&BTRFS_I(inode)->extent_tree); + btrfs_extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, + IO_TREE_BTREE_INODE_IO); + btrfs_extent_map_tree_init(&BTRFS_I(inode)->extent_tree); BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); @@ -1956,7 +1949,6 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) fs_info->qgroup_tree = RB_ROOT; INIT_LIST_HEAD(&fs_info->dirty_qgroups); fs_info->qgroup_seq = 1; - fs_info->qgroup_ulist = NULL; fs_info->qgroup_rescan_running = false; fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT; mutex_init(&fs_info->qgroup_rescan_lock); @@ -2005,7 +1997,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) btrfs_alloc_ordered_workqueue(fs_info, "qgroup-rescan", ordered_flags); fs_info->discard_ctl.discard_workers = - alloc_ordered_workqueue("btrfs_discard", WQ_FREEZABLE); + alloc_ordered_workqueue("btrfs-discard", WQ_FREEZABLE); if (!(fs_info->workers && fs_info->delalloc_workers && fs_info->flush_workers && @@ -2037,14 +2029,10 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) fs_info->csum_shash = csum_shash; - /* - * Check if the checksum implementation is a fast accelerated one. - * As-is this is a bit of a hack and should be replaced once the csum - * implementations provide that information themselves. - */ + /* Check if the checksum implementation is a fast accelerated one. */ switch (csum_type) { case BTRFS_CSUM_TYPE_CRC32: - if (!strstr(crypto_shash_driver_name(csum_shash), "generic")) + if (crc32_optimizations() & CRC32C_OPTIMIZATION) set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); break; case BTRFS_CSUM_TYPE_XXHASH: @@ -2167,8 +2155,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, found = true; root = read_tree_root_path(tree_root, path, &key); if (IS_ERR(root)) { - if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) - ret = PTR_ERR(root); + ret = PTR_ERR(root); break; } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); @@ -2199,8 +2186,8 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, static int load_global_roots(struct btrfs_root *tree_root) { - struct btrfs_path *path; - int ret = 0; + BTRFS_PATH_AUTO_FREE(path); + int ret; path = btrfs_alloc_path(); if (!path) @@ -2209,18 +2196,17 @@ static int load_global_roots(struct btrfs_root *tree_root) ret = load_global_roots_objectid(tree_root, path, BTRFS_EXTENT_TREE_OBJECTID, "extent"); if (ret) - goto out; + return ret; ret = load_global_roots_objectid(tree_root, path, BTRFS_CSUM_TREE_OBJECTID, "csum"); if (ret) - goto out; + return ret; if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE)) - goto out; + return ret; ret = load_global_roots_objectid(tree_root, path, BTRFS_FREE_SPACE_TREE_OBJECTID, "free space"); -out: - btrfs_free_path(path); + return ret; } @@ -2327,6 +2313,71 @@ out: return ret; } +static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info, + const struct btrfs_super_block *sb) +{ + unsigned int cur = 0; /* Offset inside the sys chunk array */ + /* + * At sb read time, fs_info is not fully initialized. Thus we have + * to use super block sectorsize, which should have been validated. + */ + const u32 sectorsize = btrfs_super_sectorsize(sb); + u32 sys_array_size = btrfs_super_sys_array_size(sb); + + if (sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { + btrfs_err(fs_info, "system chunk array too big %u > %u", + sys_array_size, BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); + return -EUCLEAN; + } + + while (cur < sys_array_size) { + struct btrfs_disk_key *disk_key; + struct btrfs_chunk *chunk; + struct btrfs_key key; + u64 type; + u16 num_stripes; + u32 len; + int ret; + + disk_key = (struct btrfs_disk_key *)(sb->sys_chunk_array + cur); + len = sizeof(*disk_key); + + if (cur + len > sys_array_size) + goto short_read; + cur += len; + + btrfs_disk_key_to_cpu(&key, disk_key); + if (key.type != BTRFS_CHUNK_ITEM_KEY) { + btrfs_err(fs_info, + "unexpected item type %u in sys_array at offset %u", + key.type, cur); + return -EUCLEAN; + } + chunk = (struct btrfs_chunk *)(sb->sys_chunk_array + cur); + num_stripes = btrfs_stack_chunk_num_stripes(chunk); + if (cur + btrfs_chunk_item_size(num_stripes) > sys_array_size) + goto short_read; + type = btrfs_stack_chunk_type(chunk); + if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) { + btrfs_err(fs_info, + "invalid chunk type %llu in sys_array at offset %u", + type, cur); + return -EUCLEAN; + } + ret = btrfs_check_chunk_valid(fs_info, NULL, chunk, key.offset, + sectorsize); + if (ret < 0) + return ret; + cur += btrfs_chunk_item_size(num_stripes); + } + return 0; +short_read: + btrfs_err(fs_info, + "super block sys chunk array short read, cur=%u sys_array_size=%u", + cur, sys_array_size); + return -EUCLEAN; +} + /* * Real super block validation * NOTE: super csum type and incompat features will not be checked here. @@ -2381,21 +2432,27 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, * Check sectorsize and nodesize first, other check will need it. * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here. */ - if (!is_power_of_2(sectorsize) || sectorsize < 4096 || + if (!is_power_of_2(sectorsize) || sectorsize < BTRFS_MIN_BLOCKSIZE || sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize); ret = -EINVAL; } /* - * We only support at most two sectorsizes: 4K and PAGE_SIZE. + * We only support at most 3 sectorsizes: 4K, PAGE_SIZE, MIN_BLOCKSIZE. + * + * For 4K page sized systems with non-debug builds, all 3 matches (4K). + * For 4K page sized systems with debug builds, there are two block sizes + * supported. (4K and 2K) * * We can support 16K sectorsize with 64K page size without problem, * but such sectorsize/pagesize combination doesn't make much sense. * 4K will be our future standard, PAGE_SIZE is supported from the very * beginning. */ - if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) { + if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && + sectorsize != PAGE_SIZE && + sectorsize != BTRFS_MIN_BLOCKSIZE)) { btrfs_err(fs_info, "sectorsize %llu not yet supported for page size %lu", sectorsize, PAGE_SIZE); @@ -2495,6 +2552,11 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, ret = -EINVAL; } + if (ret) + return ret; + + ret = validate_sys_chunk_array(fs_info, sb); + /* * Obvious sys_chunk_array corruptions, it must hold at least one key * and one chunk @@ -2697,10 +2759,21 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) return ret; } +/* + * Lockdep gets confused between our buffer_tree which requires IRQ locking because + * we modify marks in the IRQ context, and our delayed inode xarray which doesn't + * have these requirements. Use a class key so lockdep doesn't get them mixed up. + */ +static struct lock_class_key buffer_xa_class; + void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) { INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); - INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); + + /* Use the same flags as mapping->i_pages. */ + xa_init_flags(&fs_info->buffer_tree, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); + lockdep_set_class(&fs_info->buffer_tree.xa_lock, &buffer_xa_class); + INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); @@ -2712,7 +2785,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); spin_lock_init(&fs_info->super_lock); - spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); spin_lock_init(&fs_info->treelog_bg_lock); spin_lock_init(&fs_info->zone_active_bgs_lock); @@ -2757,6 +2829,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) BTRFS_BLOCK_RSV_GLOBAL); btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); + btrfs_init_block_rsv(&fs_info->treelog_rsv, BTRFS_BLOCK_RSV_TREELOG); btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); btrfs_init_block_rsv(&fs_info->delayed_block_rsv, BTRFS_BLOCK_RSV_DELOPS); @@ -2790,8 +2863,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) rwlock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT_CACHED; - extent_io_tree_init(fs_info, &fs_info->excluded_extents, - IO_TREE_FS_EXCLUDED_EXTENTS); + btrfs_extent_io_tree_init(fs_info, &fs_info->excluded_extents, + IO_TREE_FS_EXCLUDED_EXTENTS); mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); @@ -2856,6 +2929,10 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block if (ret) return ret; + ret = percpu_counter_init(&fs_info->stats_read_blocks, 0, GFP_KERNEL); + if (ret) + return ret; + fs_info->dirty_metadata_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids)); @@ -3239,7 +3316,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device /* * Read super block and check the signature bytes only */ - disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev); + disk_super = btrfs_read_disk_super(fs_devices->latest_dev->bdev, 0, false); if (IS_ERR(disk_super)) { ret = PTR_ERR(disk_super); goto fail_alloc; @@ -3316,11 +3393,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); fs_info->nodesize = nodesize; + fs_info->nodesize_bits = ilog2(nodesize); fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); - fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; + fs_info->fs_devices->fs_info = fs_info; /* * Handle the space caching options appropriately now that we have the @@ -3343,11 +3421,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize); - if (sectorsize < PAGE_SIZE) - btrfs_warn(fs_info, - "read-write for sector size %u with page size %lu is experimental", - sectorsize, PAGE_SIZE); - ret = btrfs_init_workqueues(fs_info); if (ret) goto fail_sb_buffer; @@ -3486,6 +3559,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_sysfs; } + btrfs_zoned_reserve_data_reloc_bg(fs_info); btrfs_free_zone_cache(fs_info); btrfs_check_active_zone_reservation(fs_info); @@ -3606,7 +3680,6 @@ fail_alloc: iput(fs_info->btree_inode); fail: - btrfs_close_devices(fs_info->fs_devices); ASSERT(ret < 0); return ret; } @@ -3619,7 +3692,7 @@ static void btrfs_end_super_write(struct bio *bio) bio_for_each_folio_all(fi, bio) { if (bio->bi_status) { - btrfs_warn_rl_in_rcu(device->fs_info, + btrfs_warn_rl(device->fs_info, "lost super block write due to IO error on %s (%d)", btrfs_dev_name(device), blk_status_to_errno(bio->bi_status)); @@ -3639,85 +3712,6 @@ static void btrfs_end_super_write(struct bio *bio) bio_put(bio); } -struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, - int copy_num, bool drop_cache) -{ - struct btrfs_super_block *super; - struct page *page; - u64 bytenr, bytenr_orig; - struct address_space *mapping = bdev->bd_mapping; - int ret; - - bytenr_orig = btrfs_sb_offset(copy_num); - ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr); - if (ret == -ENOENT) - return ERR_PTR(-EINVAL); - else if (ret) - return ERR_PTR(ret); - - if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev)) - return ERR_PTR(-EINVAL); - - if (drop_cache) { - /* This should only be called with the primary sb. */ - ASSERT(copy_num == 0); - - /* - * Drop the page of the primary superblock, so later read will - * always read from the device. - */ - invalidate_inode_pages2_range(mapping, - bytenr >> PAGE_SHIFT, - (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT); - } - - page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); - if (IS_ERR(page)) - return ERR_CAST(page); - - super = page_address(page); - if (btrfs_super_magic(super) != BTRFS_MAGIC) { - btrfs_release_disk_super(super); - return ERR_PTR(-ENODATA); - } - - if (btrfs_super_bytenr(super) != bytenr_orig) { - btrfs_release_disk_super(super); - return ERR_PTR(-EINVAL); - } - - return super; -} - - -struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev) -{ - struct btrfs_super_block *super, *latest = NULL; - int i; - u64 transid = 0; - - /* we would like to check all the supers, but that would make - * a btrfs mount succeed after a mkfs from a different FS. - * So, we need to add a special mount option to scan for - * later supers, using BTRFS_SUPER_MIRROR_MAX instead - */ - for (i = 0; i < 1; i++) { - super = btrfs_read_dev_one_super(bdev, i, false); - if (IS_ERR(super)) - continue; - - if (!latest || btrfs_super_generation(super) > transid) { - if (latest) - btrfs_release_disk_super(super); - - latest = super; - transid = btrfs_super_generation(super); - } - } - - return super; -} - /* * Write superblock @sb to the @device. Do not wait for completion, all the * folios we use for writing are locked. @@ -3757,8 +3751,8 @@ static int write_dev_supers(struct btrfs_device *device, continue; } else if (ret < 0) { btrfs_err(device->fs_info, - "couldn't get super block location for mirror %d", - i); + "couldn't get super block location for mirror %d error %d", + i, ret); atomic_inc(&device->sb_write_errors); continue; } @@ -3777,12 +3771,11 @@ static int write_dev_supers(struct btrfs_device *device, GFP_NOFS); if (IS_ERR(folio)) { btrfs_err(device->fs_info, - "couldn't get super block page for bytenr %llu", - bytenr); + "couldn't get super block page for bytenr %llu error %ld", + bytenr, PTR_ERR(folio)); atomic_inc(&device->sb_write_errors); continue; } - ASSERT(folio_order(folio) == 0); offset = offset_in_folio(folio, bytenr); disk_super = folio_address(folio) + offset; @@ -3855,7 +3848,6 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) /* If the folio has been removed, then we know it completed. */ if (IS_ERR(folio)) continue; - ASSERT(folio_order(folio) == 0); /* Folio will be unlocked once the write completes. */ folio_wait_locked(folio); @@ -3998,7 +3990,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) } if (min_tolerated == INT_MAX) { - pr_warn("BTRFS: unknown raid flag: %llu", flags); + btrfs_warn(NULL, "unknown raid flag: %llu", flags); min_tolerated = 0; } @@ -4175,8 +4167,9 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) u64 found_end; found = true; - while (find_first_extent_bit(&trans->dirty_pages, cur, - &found_start, &found_end, EXTENT_DIRTY, &cached)) { + while (btrfs_find_first_extent_bit(&trans->dirty_pages, cur, + &found_start, &found_end, + EXTENT_DIRTY, &cached)) { dirty_bytes += found_end + 1 - found_start; cur = found_end + 1; } @@ -4253,6 +4246,14 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_cleanup_defrag_inodes(fs_info); /* + * Handle the error fs first, as it will flush and wait for all ordered + * extents. This will generate delayed iputs, thus we want to handle + * it first. + */ + if (unlikely(BTRFS_FS_ERROR(fs_info))) + btrfs_error_commit_super(fs_info); + + /* * Wait for any fixup workers to complete. * If we don't wait for them here and they are still running by the time * we call kthread_stop() against the cleaner kthread further below, we @@ -4262,6 +4263,40 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) * already the cleaner, but below we run all pending delayed iputs. */ btrfs_flush_workqueue(fs_info->fixup_workers); + /* + * Similar case here, we have to wait for delalloc workers before we + * proceed below and stop the cleaner kthread, otherwise we trigger a + * use-after-tree on the cleaner kthread task_struct when a delalloc + * worker running submit_compressed_extents() adds a delayed iput, which + * does a wake up on the cleaner kthread, which was already freed below + * when we call kthread_stop(). + */ + btrfs_flush_workqueue(fs_info->delalloc_workers); + + /* + * We can have ordered extents getting their last reference dropped from + * the fs_info->workers queue because for async writes for data bios we + * queue a work for that queue, at btrfs_wq_submit_bio(), that runs + * run_one_async_done() which calls btrfs_bio_end_io() in case the bio + * has an error, and that later function can do the final + * btrfs_put_ordered_extent() on the ordered extent attached to the bio, + * which adds a delayed iput for the inode. So we must flush the queue + * so that we don't have delayed iputs after committing the current + * transaction below and stopping the cleaner and transaction kthreads. + */ + btrfs_flush_workqueue(fs_info->workers); + + /* + * When finishing a compressed write bio we schedule a work queue item + * to finish an ordered extent - btrfs_finish_compressed_write_work() + * calls btrfs_finish_ordered_extent() which in turns does a call to + * btrfs_queue_ordered_fn(), and that queues the ordered extent + * completion either in the endio_write_workers work queue or in the + * fs_info->endio_freespace_worker work queue. We flush those queues + * below, so before we flush them we must flush this queue for the + * workers of compressed writes. + */ + flush_workqueue(fs_info->compressed_write_workers); /* * After we parked the cleaner kthread, ordered extents may have @@ -4274,8 +4309,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) * * So wait for all ongoing ordered extents to complete and then run * delayed iputs. This works because once we reach this point no one - * can either create new ordered extents nor create delayed iputs - * through some other means. + * can create new ordered extents, but delayed iputs can still be added + * by a reclaim worker (see comments further below). * * Also note that btrfs_wait_ordered_roots() is not safe here, because * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent, @@ -4286,6 +4321,10 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_flush_workqueue(fs_info->endio_write_workers); /* Ordered extents for free space inodes. */ btrfs_flush_workqueue(fs_info->endio_freespace_worker); + /* + * Run delayed iputs in case an async reclaim worker is waiting for them + * to be run as mentioned above. + */ btrfs_run_delayed_iputs(fs_info); cancel_work_sync(&fs_info->async_reclaim_work); @@ -4293,6 +4332,18 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->preempt_reclaim_work); cancel_work_sync(&fs_info->em_shrinker_work); + /* + * Run delayed iputs again because an async reclaim worker may have + * added new ones if it was flushing delalloc: + * + * shrink_delalloc() -> btrfs_start_delalloc_roots() -> + * start_delalloc_inodes() -> btrfs_add_delayed_iput() + */ + btrfs_run_delayed_iputs(fs_info); + + /* There should be no more workload to generate new delayed iputs. */ + set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state); + /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); @@ -4321,9 +4372,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "commit super ret %d", ret); } - if (BTRFS_FS_ERROR(fs_info)) - btrfs_error_commit_super(fs_info); - kthread_stop(fs_info->transaction_kthread); kthread_stop(fs_info->cleaner_kthread); @@ -4331,7 +4379,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags); if (btrfs_check_quota_leak(fs_info)) { - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN("qgroup reserved space leaked"); btrfs_err(fs_info, "qgroup reserved space leaked"); } @@ -4378,7 +4426,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) iput(fs_info->btree_inode); btrfs_mapping_tree_free(fs_info); - btrfs_close_devices(fs_info->fs_devices); } void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans, @@ -4446,10 +4493,6 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) /* cleanup FS via transaction */ btrfs_cleanup_transaction(fs_info); - mutex_lock(&fs_info->cleaner_mutex); - btrfs_run_delayed_iputs(fs_info); - mutex_unlock(&fs_info->cleaner_mutex); - down_write(&fs_info->cleanup_work_sem); up_write(&fs_info->cleanup_work_sem); } @@ -4592,9 +4635,9 @@ static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, u64 start = 0; u64 end; - while (find_first_extent_bit(dirty_pages, start, &start, &end, - mark, NULL)) { - clear_extent_bits(dirty_pages, start, end, mark); + while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end, + mark, NULL)) { + btrfs_clear_extent_bit(dirty_pages, start, end, mark, NULL); while (start <= end) { eb = find_extent_buffer(fs_info, start); start += fs_info->nodesize; @@ -4627,14 +4670,14 @@ static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, * the same extent range. */ mutex_lock(&fs_info->unused_bg_unpin_mutex); - if (!find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, &cached_state)) { + if (!btrfs_find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, &cached_state)) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } - clear_extent_dirty(unpin, start, end, &cached_state); - free_extent_state(cached_state); + btrfs_clear_extent_dirty(unpin, start, end, &cached_state); + btrfs_free_extent_state(cached_state); btrfs_error_unpin_extent_range(fs_info, start, end); mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); @@ -4820,7 +4863,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) int btrfs_init_root_free_objectid(struct btrfs_root *root) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; struct extent_buffer *l; struct btrfs_key search_key; @@ -4836,14 +4879,13 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) search_key.offset = (u64)-1; ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) - goto error; + return ret; if (ret == 0) { /* * Key with offset -1 found, there would have to exist a root * with such id, but this is out of valid range. */ - ret = -EUCLEAN; - goto error; + return -EUCLEAN; } if (path->slots[0] > 0) { slot = path->slots[0] - 1; @@ -4854,10 +4896,8 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) } else { root->free_objectid = BTRFS_FIRST_FREE_OBJECTID; } - ret = 0; -error: - btrfs_free_path(path); - return ret; + + return 0; } int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index a7051e2570c1..864a55a96226 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -58,9 +58,6 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, const struct btrfs_super_block *sb, int mirror_num); int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount); int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors); -struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev); -struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, - int copy_num, bool drop_cache); int btrfs_commit_super(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, const struct btrfs_key *key); @@ -96,9 +93,6 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); /* * This function is used to grab the root, and avoid it is freed when we * access it. But it doesn't ensure that the tree is not dropped. - * - * If you want to ensure the whole tree is safe, you should use - * fs_info->subvol_srcu */ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) { @@ -117,7 +111,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int btrfs_read_extent_buffer(struct extent_buffer *buf, const struct btrfs_tree_parent_check *check); -blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio); +int btree_csum_one_bio(struct btrfs_bio *bbio); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index e2b22bea348a..7fc8a3200b40 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -75,7 +75,7 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, { struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root; - struct inode *inode; + struct btrfs_inode *inode; if (objectid < BTRFS_FIRST_FREE_OBJECTID) return ERR_PTR(-ESTALE); @@ -89,12 +89,12 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, if (IS_ERR(inode)) return ERR_CAST(inode); - if (generation != 0 && generation != inode->i_generation) { - iput(inode); + if (generation != 0 && generation != inode->vfs_inode.i_generation) { + iput(&inode->vfs_inode); return ERR_PTR(-ESTALE); } - return d_obtain_alias(inode); + return d_obtain_alias(&inode->vfs_inode); } static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, @@ -145,9 +145,10 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, struct dentry *btrfs_get_parent(struct dentry *child) { - struct inode *dir = d_inode(child); - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_inode *dir = BTRFS_I(d_inode(child)); + struct btrfs_inode *inode; + struct btrfs_root *root = dir->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_root_ref *ref; @@ -159,13 +160,13 @@ struct dentry *btrfs_get_parent(struct dentry *child) if (!path) return ERR_PTR(-ENOMEM); - if (btrfs_ino(BTRFS_I(dir)) == BTRFS_FIRST_FREE_OBJECTID) { + if (btrfs_ino(dir) == BTRFS_FIRST_FREE_OBJECTID) { key.objectid = btrfs_root_id(root); key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = (u64)-1; root = fs_info->tree_root; } else { - key.objectid = btrfs_ino(BTRFS_I(dir)); + key.objectid = btrfs_ino(dir); key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; } @@ -210,7 +211,11 @@ struct dentry *btrfs_get_parent(struct dentry *child) found_key.offset, 0); } - return d_obtain_alias(btrfs_iget(key.objectid, root)); + inode = btrfs_iget(key.objectid, root); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return d_obtain_alias(&inode->vfs_inode); fail: btrfs_free_path(path); return ERR_PTR(ret); @@ -219,11 +224,11 @@ fail: static int btrfs_get_name(struct dentry *parent, char *name, struct dentry *child) { - struct inode *inode = d_inode(child); - struct inode *dir = d_inode(parent); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_path *path; - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_inode *inode = BTRFS_I(d_inode(child)); + struct btrfs_inode *dir = BTRFS_I(d_inode(parent)); + struct btrfs_root *root = dir->root; + struct btrfs_fs_info *fs_info = root->fs_info; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_inode_ref *iref; struct btrfs_root_ref *rref; struct extent_buffer *leaf; @@ -233,37 +238,34 @@ static int btrfs_get_name(struct dentry *parent, char *name, int ret; u64 ino; - if (!S_ISDIR(dir->i_mode)) + if (!S_ISDIR(dir->vfs_inode.i_mode)) return -EINVAL; - ino = btrfs_ino(BTRFS_I(inode)); + ino = btrfs_ino(inode); path = btrfs_alloc_path(); if (!path) return -ENOMEM; if (ino == BTRFS_FIRST_FREE_OBJECTID) { - key.objectid = btrfs_root_id(BTRFS_I(inode)->root); + key.objectid = btrfs_root_id(inode->root); key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = (u64)-1; root = fs_info->tree_root; } else { key.objectid = ino; - key.offset = btrfs_ino(BTRFS_I(dir)); key.type = BTRFS_INODE_REF_KEY; + key.offset = btrfs_ino(dir); } ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { - btrfs_free_path(path); return ret; } else if (ret > 0) { - if (ino == BTRFS_FIRST_FREE_OBJECTID) { + if (ino == BTRFS_FIRST_FREE_OBJECTID) path->slots[0]--; - } else { - btrfs_free_path(path); + else return -ENOENT; - } } leaf = path->nodes[0]; @@ -280,7 +282,6 @@ static int btrfs_get_name(struct dentry *parent, char *name, } read_extent_buffer(leaf, name, name_ptr, name_len); - btrfs_free_path(path); /* * have to add the null termination to make sure that reconnect_path diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 6d08c100b01d..66361325f6dc 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -42,8 +42,9 @@ static inline void btrfs_extent_state_leak_debug_check(void) struct extent_state *state; while (!list_empty(&states)) { - state = list_entry(states.next, struct extent_state, leak_list); - pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", + state = list_first_entry(&states, struct extent_state, leak_list); + btrfs_err(NULL, + "state leak: start %llu end %llu state %u in tree %d refs %d", state->start, state->end, state->state, extent_state_in_tree(state), refcount_read(&state->refs)); @@ -59,13 +60,12 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, struct extent_io_tree *tree, u64 start, u64 end) { - const struct btrfs_inode *inode; + const struct btrfs_inode *inode = tree->inode; u64 isize; if (tree->owner != IO_TREE_INODE_IO) return; - inode = extent_io_tree_to_inode_const(tree); isize = i_size_read(&inode->vfs_inode); if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { btrfs_debug_rl(inode->root->fs_info, @@ -80,25 +80,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, #define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) #endif - -/* - * The only tree allowed to set the inode is IO_TREE_INODE_IO. - */ -static bool is_inode_io_tree(const struct extent_io_tree *tree) -{ - return tree->owner == IO_TREE_INODE_IO; -} - -/* Return the inode if it's valid for the given tree, otherwise NULL. */ -struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree) -{ - if (tree->owner == IO_TREE_INODE_IO) - return tree->inode; - return NULL; -} - /* Read-only access to the inode. */ -const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree) +const struct btrfs_inode *btrfs_extent_io_tree_to_inode(const struct extent_io_tree *tree) { if (tree->owner == IO_TREE_INODE_IO) return tree->inode; @@ -106,15 +89,15 @@ const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_t } /* For read-only access to fs_info. */ -const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree) +const struct btrfs_fs_info *btrfs_extent_io_tree_to_fs_info(const struct extent_io_tree *tree) { if (tree->owner == IO_TREE_INODE_IO) return tree->inode->root->fs_info; return tree->fs_info; } -void extent_io_tree_init(struct btrfs_fs_info *fs_info, - struct extent_io_tree *tree, unsigned int owner) +void btrfs_extent_io_tree_init(struct btrfs_fs_info *fs_info, + struct extent_io_tree *tree, unsigned int owner) { tree->state = RB_ROOT; spin_lock_init(&tree->lock); @@ -129,7 +112,7 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info, * aren't any waiters on any extent state record (EXTENT_LOCK_BITS are never * set on any extent state when calling this function). */ -void extent_io_tree_release(struct extent_io_tree *tree) +void btrfs_extent_io_tree_release(struct extent_io_tree *tree) { struct rb_root root; struct extent_state *state; @@ -148,7 +131,7 @@ void extent_io_tree_release(struct extent_io_tree *tree) * (see wait_extent_bit()). */ ASSERT(!waitqueue_active(&state->wq)); - free_extent_state(state); + btrfs_free_extent_state(state); cond_resched_lock(&tree->lock); } /* @@ -176,7 +159,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) btrfs_leak_debug_add_state(state); refcount_set(&state->refs, 1); init_waitqueue_head(&state->wq); - trace_alloc_extent_state(state, mask, _RET_IP_); + trace_btrfs_alloc_extent_state(state, mask, _RET_IP_); return state; } @@ -188,14 +171,14 @@ static struct extent_state *alloc_extent_state_atomic(struct extent_state *preal return prealloc; } -void free_extent_state(struct extent_state *state) +void btrfs_free_extent_state(struct extent_state *state) { if (!state) return; if (refcount_dec_and_test(&state->refs)) { WARN_ON(extent_state_in_tree(state)); btrfs_leak_debug_del_state(state); - trace_free_extent_state(state, _RET_IP_); + trace_btrfs_free_extent_state(state, _RET_IP_); kmem_cache_free(extent_state_cache, state); } } @@ -222,38 +205,34 @@ static inline struct extent_state *next_state(struct extent_state *state) { struct rb_node *next = rb_next(&state->rb_node); - if (next) - return rb_entry(next, struct extent_state, rb_node); - else - return NULL; + return rb_entry_safe(next, struct extent_state, rb_node); } static inline struct extent_state *prev_state(struct extent_state *state) { struct rb_node *next = rb_prev(&state->rb_node); - if (next) - return rb_entry(next, struct extent_state, rb_node); - else - return NULL; + return rb_entry_safe(next, struct extent_state, rb_node); } /* - * Search @tree for an entry that contains @offset. Such entry would have - * entry->start <= offset && entry->end >= offset. + * Search @tree for an entry that contains @offset or if none exists for the + * first entry that starts and ends after that offset. * * @tree: the tree to search - * @offset: offset that should fall within an entry in @tree + * @offset: search offset * @node_ret: pointer where new node should be anchored (used when inserting an * entry in the tree) * @parent_ret: points to entry which would have been the parent of the entry, * containing @offset * - * Return a pointer to the entry that contains @offset byte address and don't change - * @node_ret and @parent_ret. + * Return a pointer to the entry that contains @offset byte address. + * + * If no such entry exists, return the first entry that starts and ends after + * @offset if one exists, otherwise NULL. * - * If no such entry exists, return pointer to entry that ends before @offset - * and fill parameters @node_ret and @parent_ret, ie. does not return NULL. + * If the returned entry starts at @offset, then @node_ret and @parent_ret + * aren't changed. */ static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree, u64 offset, @@ -282,7 +261,11 @@ static inline struct extent_state *tree_search_for_insert(struct extent_io_tree if (parent_ret) *parent_ret = prev; - /* Search neighbors until we find the first one past the end */ + /* + * Return either the current entry if it contains offset (it ends after + * or at offset) or the first entry that starts and ends after offset if + * one exists, or NULL. + */ while (entry && offset > entry->end) entry = next_state(entry); @@ -346,12 +329,12 @@ static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 return tree_search_for_insert(tree, offset, NULL, NULL); } -static void extent_io_tree_panic(const struct extent_io_tree *tree, - const struct extent_state *state, - const char *opname, - int err) +static void __cold extent_io_tree_panic(const struct extent_io_tree *tree, + const struct extent_state *state, + const char *opname, + int err) { - btrfs_panic(extent_io_tree_to_fs_info(tree), err, + btrfs_panic(btrfs_extent_io_tree_to_fs_info(tree), err, "extent io tree error on %s state start %llu end %llu", opname, state->start, state->end); } @@ -362,13 +345,12 @@ static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *s prev = prev_state(state); if (prev && prev->end == state->start - 1 && prev->state == state->state) { - if (is_inode_io_tree(tree)) - btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree), - state, prev); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_merge_delalloc_extent(tree->inode, state, prev); state->start = prev->start; rb_erase(&prev->rb_node, &tree->state); RB_CLEAR_NODE(&prev->rb_node); - free_extent_state(prev); + btrfs_free_extent_state(prev); } } @@ -378,13 +360,12 @@ static void merge_next_state(struct extent_io_tree *tree, struct extent_state *s next = next_state(state); if (next && next->start == state->end + 1 && next->state == state->state) { - if (is_inode_io_tree(tree)) - btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree), - state, next); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_merge_delalloc_extent(tree->inode, state, next); state->end = next->end; rb_erase(&next->rb_node, &tree->state); RB_CLEAR_NODE(&next->rb_node); - free_extent_state(next); + btrfs_free_extent_state(next); } } @@ -413,8 +394,8 @@ static void set_state_bits(struct extent_io_tree *tree, u32 bits_to_set = bits & ~EXTENT_CTLBITS; int ret; - if (is_inode_io_tree(tree)) - btrfs_set_delalloc_extent(extent_io_tree_to_inode(tree), state, bits); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_set_delalloc_extent(tree->inode, state, bits); ret = add_extent_changeset(state, bits_to_set, changeset, 1); BUG_ON(ret < 0); @@ -459,10 +440,9 @@ static struct extent_state *insert_state(struct extent_io_tree *tree, if (state->end < entry->start) { if (try_merge && end == entry->start && state->state == entry->state) { - if (is_inode_io_tree(tree)) - btrfs_merge_delalloc_extent( - extent_io_tree_to_inode(tree), - state, entry); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_merge_delalloc_extent(tree->inode, + state, entry); entry->start = state->start; merge_prev_state(tree, entry); state->state = 0; @@ -472,10 +452,9 @@ static struct extent_state *insert_state(struct extent_io_tree *tree, } else if (state->end > entry->end) { if (try_merge && entry->end == start && state->state == entry->state) { - if (is_inode_io_tree(tree)) - btrfs_merge_delalloc_extent( - extent_io_tree_to_inode(tree), - state, entry); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_merge_delalloc_extent(tree->inode, + state, entry); entry->end = state->end; merge_next_state(tree, entry); state->state = 0; @@ -527,9 +506,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, struct rb_node *parent = NULL; struct rb_node **node; - if (is_inode_io_tree(tree)) - btrfs_split_delalloc_extent(extent_io_tree_to_inode(tree), orig, - split); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_split_delalloc_extent(tree->inode, orig, split); prealloc->start = orig->start; prealloc->end = split - 1; @@ -549,7 +527,7 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, } else if (prealloc->end > entry->end) { node = &(*node)->rb_right; } else { - free_extent_state(prealloc); + btrfs_free_extent_state(prealloc); return -EEXIST; } } @@ -561,6 +539,18 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, } /* + * Use this during tree iteration to avoid doing next node searches when it's + * not needed (the current record ends at or after the target range's end). + */ +static inline struct extent_state *next_search_state(struct extent_state *state, u64 end) +{ + if (state->end < end) + return next_state(state); + + return NULL; +} + +/* * Utility function to clear some bits in an extent state struct. It will * optionally wake up anyone waiting on this state (wake == 1). * @@ -569,16 +559,15 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, */ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, - u32 bits, int wake, + u32 bits, int wake, u64 end, struct extent_changeset *changeset) { struct extent_state *next; u32 bits_to_clear = bits & ~EXTENT_CTLBITS; int ret; - if (is_inode_io_tree(tree)) - btrfs_clear_delalloc_extent(extent_io_tree_to_inode(tree), state, - bits); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_clear_delalloc_extent(tree->inode, state, bits); ret = add_extent_changeset(state, bits_to_clear, changeset, 0); BUG_ON(ret < 0); @@ -586,17 +575,17 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, if (wake) wake_up(&state->wq); if (state->state == 0) { - next = next_state(state); + next = next_search_state(state, end); if (extent_state_in_tree(state)) { rb_erase(&state->rb_node, &tree->state); RB_CLEAR_NODE(&state->rb_node); - free_extent_state(state); + btrfs_free_extent_state(state); } else { WARN_ON(1); } } else { merge_state(tree, state); - next = next_state(state); + next = next_search_state(state, end); } return next; } @@ -620,18 +609,18 @@ static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask) * * This takes the tree lock, and returns 0 on success and < 0 on error. */ -int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_state **cached_state, - struct extent_changeset *changeset) +int btrfs_clear_extent_bit_changeset(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached_state, + struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *cached; struct extent_state *prealloc = NULL; u64 last_end; - int err; - int clear = 0; - int wake; - int delete = (bits & EXTENT_CLEAR_ALL_BITS); + int ret = 0; + bool clear; + bool wake; + const bool delete = (bits & EXTENT_CLEAR_ALL_BITS); gfp_t mask; set_gfp_mask_from_bits(&bits, &mask); @@ -644,9 +633,8 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (bits & EXTENT_DELALLOC) bits |= EXTENT_NORESERVE; - wake = ((bits & EXTENT_LOCK_BITS) ? 1 : 0); - if (bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY)) - clear = 1; + wake = (bits & EXTENT_LOCK_BITS); + clear = (bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY)); again: if (!prealloc) { /* @@ -676,7 +664,7 @@ again: goto hit_next; } if (clear) - free_extent_state(cached); + btrfs_free_extent_state(cached); } /* This search will find the extents that end after our range starts. */ @@ -691,7 +679,7 @@ hit_next: /* The state doesn't have the wanted bits, go ahead. */ if (!(state->state & bits)) { - state = next_state(state); + state = next_search_state(state, end); goto next; } @@ -714,18 +702,24 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) goto search_again; - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, state, "split", err); - + ret = split_state(tree, state, prealloc, start); prealloc = NULL; - if (err) + if (ret) { + extent_io_tree_panic(tree, state, "split", ret); goto out; + } if (state->end <= end) { - state = clear_state_bit(tree, state, bits, wake, changeset); + state = clear_state_bit(tree, state, bits, wake, end, + changeset); goto next; } - goto search_again; + if (need_resched()) + goto search_again; + /* + * Fallthrough and try atomic extent state allocation if needed. + * If it fails we'll jump to 'search_again' retry the allocation + * in non-atomic mode and start the search again. + */ } /* * | ---- desired range ---- | @@ -736,30 +730,31 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) goto search_again; - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, state, "split", err); + ret = split_state(tree, state, prealloc, end + 1); + if (ret) { + extent_io_tree_panic(tree, state, "split", ret); + prealloc = NULL; + goto out; + } if (wake) wake_up(&state->wq); - clear_state_bit(tree, prealloc, bits, wake, changeset); + clear_state_bit(tree, prealloc, bits, wake, end, changeset); prealloc = NULL; goto out; } - state = clear_state_bit(tree, state, bits, wake, changeset); + state = clear_state_bit(tree, state, bits, wake, end, changeset); next: - if (last_end == (u64)-1) + if (last_end >= end) goto out; start = last_end + 1; - if (start <= end && state && !need_resched()) + if (state && !need_resched()) goto hit_next; search_again: - if (start > end) - goto out; spin_unlock(&tree->lock); if (gfpflags_allow_blocking(mask)) cond_resched(); @@ -767,10 +762,9 @@ search_again: out: spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); + btrfs_free_extent_state(prealloc); - return 0; + return ret; } @@ -820,7 +814,7 @@ process_node: schedule(); spin_lock(&tree->lock); finish_wait(&state->wq, &wait); - free_extent_state(state); + btrfs_free_extent_state(state); goto again; } start = state->end + 1; @@ -838,7 +832,7 @@ out: if (cached_state && *cached_state) { state = *cached_state; *cached_state = NULL; - free_extent_state(state); + btrfs_free_extent_state(state); } spin_unlock(&tree->lock); } @@ -877,7 +871,7 @@ static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *t */ state = tree_search(tree, start); while (state) { - if (state->end >= start && (state->state & bits)) + if (state->state & bits) return state; state = next_state(state); } @@ -892,9 +886,9 @@ static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *t * Return true if we find something, and update @start_ret and @end_ret. * Return false if we found nothing. */ -bool find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits, - struct extent_state **cached_state) +bool btrfs_find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits, + struct extent_state **cached_state) { struct extent_state *state; bool ret = false; @@ -914,13 +908,13 @@ bool find_first_extent_bit(struct extent_io_tree *tree, u64 start, * again. If we haven't found any, clear as well since * it's now useless. */ - free_extent_state(*cached_state); + btrfs_free_extent_state(*cached_state); *cached_state = NULL; if (state) goto got_it; goto out; } - free_extent_state(*cached_state); + btrfs_free_extent_state(*cached_state); *cached_state = NULL; } @@ -952,14 +946,17 @@ out: * contiguous area for given bits. We will search to the first bit we find, and * then walk down the tree until we find a non-contiguous area. The area * returned will be the full contiguous area with the bits set. + * + * Returns true if we found a range with the given bits set, in which case + * @start_ret and @end_ret are updated, or false if no range was found. */ -int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits) +bool btrfs_find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits) { struct extent_state *state; - int ret = 1; + bool ret = false; - ASSERT(!btrfs_fs_incompat(extent_io_tree_to_fs_info(tree), NO_HOLES)); + ASSERT(!btrfs_fs_incompat(btrfs_extent_io_tree_to_fs_info(tree), NO_HOLES)); spin_lock(&tree->lock); state = find_first_extent_bit_state(tree, start, bits); @@ -971,7 +968,7 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, break; *end_ret = state->end; } - ret = 0; + ret = true; } spin_unlock(&tree->lock); return ret; @@ -1046,11 +1043,11 @@ out: * * [start, end] is inclusive This takes the tree lock. */ -static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, u64 *failed_start, - struct extent_state **failed_state, - struct extent_state **cached_state, - struct extent_changeset *changeset) +static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, u64 *failed_start, + struct extent_state **failed_state, + struct extent_state **cached_state, + struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -1129,12 +1126,11 @@ hit_next: set_state_bits(tree, state, bits, changeset); cache_state(state, cached_state); merge_state(tree, state); - if (last_end == (u64)-1) + if (last_end >= end) goto out; start = last_end + 1; state = next_state(state); - if (start < end && state && state->start == start && - !need_resched()) + if (state && state->start == start && !need_resched()) goto hit_next; goto search_again; } @@ -1186,12 +1182,11 @@ hit_next: set_state_bits(tree, state, bits, changeset); cache_state(state, cached_state); merge_state(tree, state); - if (last_end == (u64)-1) + if (last_end >= end) goto out; start = last_end + 1; state = next_state(state); - if (start < end && state && state->start == start && - !need_resched()) + if (state && state->start == start && !need_resched()) goto hit_next; } goto search_again; @@ -1204,14 +1199,8 @@ hit_next: * extent we found. */ if (state->start > start) { - u64 this_end; struct extent_state *inserted_state; - if (end < last_start) - this_end = end; - else - this_end = last_start - 1; - prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) goto search_again; @@ -1221,17 +1210,38 @@ hit_next: * extent. */ prealloc->start = start; - prealloc->end = this_end; + if (end < last_start) + prealloc->end = end; + else + prealloc->end = last_start - 1; + inserted_state = insert_state(tree, prealloc, bits, changeset); if (IS_ERR(inserted_state)) { ret = PTR_ERR(inserted_state); extent_io_tree_panic(tree, prealloc, "insert", ret); + goto out; } cache_state(inserted_state, cached_state); if (inserted_state == prealloc) prealloc = NULL; - start = this_end + 1; + start = inserted_state->end + 1; + + /* Beyond target range, stop. */ + if (start > end) + goto out; + + if (need_resched()) + goto search_again; + + state = next_search_state(inserted_state, end); + /* + * If there's a next state, whether contiguous or not, we don't + * need to unlock and start search agian. If it's not contiguous + * we will end up here and try to allocate a prealloc state and insert. + */ + if (state) + goto hit_next; goto search_again; } /* @@ -1252,8 +1262,11 @@ hit_next: if (!prealloc) goto search_again; ret = split_state(tree, state, prealloc, end + 1); - if (ret) + if (ret) { extent_io_tree_panic(tree, state, "split", ret); + prealloc = NULL; + goto out; + } set_state_bits(tree, prealloc, bits, changeset); cache_state(prealloc, cached_state); @@ -1272,18 +1285,16 @@ search_again: out: spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); + btrfs_free_extent_state(prealloc); return ret; } -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_state **cached_state) +int btrfs_set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached_state) { - return __set_extent_bit(tree, start, end, bits, NULL, NULL, - cached_state, NULL); + return set_extent_bit(tree, start, end, bits, NULL, NULL, cached_state, NULL); } /* @@ -1304,9 +1315,9 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, * * All allocations are done with GFP_NOFS. */ -int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, u32 clear_bits, - struct extent_state **cached_state) +int btrfs_convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, u32 clear_bits, + struct extent_state **cached_state) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -1374,12 +1385,11 @@ hit_next: if (state->start == start && state->end <= end) { set_state_bits(tree, state, bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, clear_bits, 0, NULL); - if (last_end == (u64)-1) + state = clear_state_bit(tree, state, clear_bits, 0, end, NULL); + if (last_end >= end) goto out; start = last_end + 1; - if (start < end && state && state->start == start && - !need_resched()) + if (state && state->start == start && !need_resched()) goto hit_next; goto search_again; } @@ -1406,20 +1416,19 @@ hit_next: goto out; } ret = split_state(tree, state, prealloc, start); - if (ret) - extent_io_tree_panic(tree, state, "split", ret); prealloc = NULL; - if (ret) + if (ret) { + extent_io_tree_panic(tree, state, "split", ret); goto out; + } if (state->end <= end) { set_state_bits(tree, state, bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, clear_bits, 0, NULL); - if (last_end == (u64)-1) + state = clear_state_bit(tree, state, clear_bits, 0, end, NULL); + if (last_end >= end) goto out; start = last_end + 1; - if (start < end && state && state->start == start && - !need_resched()) + if (state && state->start == start && !need_resched()) goto hit_next; } goto search_again; @@ -1432,14 +1441,8 @@ hit_next: * extent we found. */ if (state->start > start) { - u64 this_end; struct extent_state *inserted_state; - if (end < last_start) - this_end = end; - else - this_end = last_start - 1; - prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) { ret = -ENOMEM; @@ -1451,16 +1454,37 @@ hit_next: * extent. */ prealloc->start = start; - prealloc->end = this_end; + if (end < last_start) + prealloc->end = end; + else + prealloc->end = last_start - 1; + inserted_state = insert_state(tree, prealloc, bits, NULL); if (IS_ERR(inserted_state)) { ret = PTR_ERR(inserted_state); extent_io_tree_panic(tree, prealloc, "insert", ret); + goto out; } cache_state(inserted_state, cached_state); if (inserted_state == prealloc) prealloc = NULL; - start = this_end + 1; + start = inserted_state->end + 1; + + /* Beyond target range, stop. */ + if (start > end) + goto out; + + if (need_resched()) + goto search_again; + + state = next_search_state(inserted_state, end); + /* + * If there's a next state, whether contiguous or not, we don't + * need to unlock and start search again. If it's not contiguous + * we will end up here and try to allocate a prealloc state and insert. + */ + if (state) + goto hit_next; goto search_again; } /* @@ -1477,12 +1501,15 @@ hit_next: } ret = split_state(tree, state, prealloc, end + 1); - if (ret) + if (ret) { extent_io_tree_panic(tree, state, "split", ret); + prealloc = NULL; + goto out; + } set_state_bits(tree, prealloc, bits, NULL); cache_state(prealloc, cached_state); - clear_state_bit(tree, prealloc, clear_bits, 0, NULL); + clear_state_bit(tree, prealloc, clear_bits, 0, end, NULL); prealloc = NULL; goto out; } @@ -1497,8 +1524,7 @@ search_again: out: spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); + btrfs_free_extent_state(prealloc); return ret; } @@ -1518,8 +1544,8 @@ out: * spans (last_range_end, end of device]. In this case it's up to the caller to * trim @end_ret to the appropriate size. */ -void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits) +void btrfs_find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits) { struct extent_state *state; struct extent_state *prev = NULL, *next = NULL; @@ -1636,10 +1662,10 @@ out: * all given bits set. If the returned number of bytes is greater than zero * then @start is updated with the offset of the first byte with the bits set. */ -u64 count_range_bits(struct extent_io_tree *tree, - u64 *start, u64 search_end, u64 max_bytes, - u32 bits, int contig, - struct extent_state **cached_state) +u64 btrfs_count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, u64 max_bytes, + u32 bits, int contig, + struct extent_state **cached_state) { struct extent_state *state = NULL; struct extent_state *cached; @@ -1710,7 +1736,7 @@ search: } if (cached_state) { - free_extent_state(*cached_state); + btrfs_free_extent_state(*cached_state); *cached_state = state; if (state) refcount_inc(&state->refs); @@ -1724,16 +1750,16 @@ search: /* * Check if the single @bit exists in the given range. */ -bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit) +bool btrfs_test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit) { - struct extent_state *state = NULL; + struct extent_state *state; bool bitset = false; ASSERT(is_power_of_2(bit)); spin_lock(&tree->lock); state = tree_search(tree, start); - while (state && start <= end) { + while (state) { if (state->start > end) break; @@ -1742,9 +1768,7 @@ bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 break; } - /* If state->end is (u64)-1, start will overflow to 0 */ - start = state->end + 1; - if (start > end || start == 0) + if (state->end >= end) break; state = next_state(state); } @@ -1752,16 +1776,51 @@ bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 return bitset; } +void btrfs_get_range_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 *bits, + struct extent_state **cached_state) +{ + struct extent_state *state; + + /* + * The cached state is currently mandatory and not used to start the + * search, only to cache the first state record found in the range. + */ + ASSERT(cached_state != NULL); + ASSERT(*cached_state == NULL); + + *bits = 0; + + spin_lock(&tree->lock); + state = tree_search(tree, start); + if (state && state->start < end) { + *cached_state = state; + refcount_inc(&state->refs); + } + while (state) { + if (state->start > end) + break; + + *bits |= state->state; + + if (state->end >= end) + break; + + state = next_state(state); + } + spin_unlock(&tree->lock); +} + /* * Check if the whole range [@start,@end) contains the single @bit set. */ -bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, - struct extent_state *cached) +bool btrfs_test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, + struct extent_state *cached) { - struct extent_state *state = NULL; + struct extent_state *state; bool bitset = true; ASSERT(is_power_of_2(bit)); + ASSERT(start < end); spin_lock(&tree->lock); if (cached && extent_state_in_tree(cached) && cached->start <= start && @@ -1769,30 +1828,22 @@ bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, state = cached; else state = tree_search(tree, start); - while (state && start <= end) { + while (state) { if (state->start > start) { bitset = false; break; } - if (state->start > end) - break; - if ((state->state & bit) == 0) { bitset = false; break; } - if (state->end == (u64)-1) + if (state->end >= end) break; - /* - * Last entry (if state->end is (u64)-1 and overflow happens), - * or next entry starts after the range. - */ + /* Next state must start where this one ends. */ start = state->end + 1; - if (start > end || start == 0) - break; state = next_state(state); } @@ -1804,8 +1855,8 @@ bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, } /* Wrappers around set/clear extent bit */ -int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_changeset *changeset) +int btrfs_set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_changeset *changeset) { /* * We don't support EXTENT_LOCK_BITS yet, as current changeset will @@ -1814,11 +1865,11 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ ASSERT(!(bits & EXTENT_LOCK_BITS)); - return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset); + return set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset); } -int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_changeset *changeset) +int btrfs_clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_changeset *changeset) { /* * Don't support EXTENT_LOCK_BITS case, same reason as @@ -1826,20 +1877,20 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ ASSERT(!(bits & EXTENT_LOCK_BITS)); - return __clear_extent_bit(tree, start, end, bits, NULL, changeset); + return btrfs_clear_extent_bit_changeset(tree, start, end, bits, NULL, changeset); } -bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - struct extent_state **cached) +bool btrfs_try_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached) { - int err; + int ret; u64 failed_start; - err = __set_extent_bit(tree, start, end, bits, &failed_start, - NULL, cached, NULL); - if (err == -EEXIST) { + ret = set_extent_bit(tree, start, end, bits, &failed_start, NULL, cached, NULL); + if (ret == -EEXIST) { if (failed_start > start) - clear_extent_bit(tree, start, failed_start - 1, bits, cached); + btrfs_clear_extent_bit(tree, start, failed_start - 1, + bits, cached); return 0; } return 1; @@ -1849,35 +1900,54 @@ bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits * Either insert or lock state struct between start and end use mask to tell * us if waiting is desired. */ -int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - struct extent_state **cached_state) +int btrfs_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached_state) { struct extent_state *failed_state = NULL; - int err; + int ret; u64 failed_start; - err = __set_extent_bit(tree, start, end, bits, &failed_start, - &failed_state, cached_state, NULL); - while (err == -EEXIST) { + ret = set_extent_bit(tree, start, end, bits, &failed_start, + &failed_state, cached_state, NULL); + while (ret == -EEXIST) { if (failed_start != start) - clear_extent_bit(tree, start, failed_start - 1, - bits, cached_state); + btrfs_clear_extent_bit(tree, start, failed_start - 1, + bits, cached_state); wait_extent_bit(tree, failed_start, end, bits, &failed_state); - err = __set_extent_bit(tree, start, end, bits, - &failed_start, &failed_state, - cached_state, NULL); + ret = set_extent_bit(tree, start, end, bits, &failed_start, + &failed_state, cached_state, NULL); } - return err; + return ret; +} + +/* + * Get the extent state that follows the given extent state. + * This is meant to be used in a context where we know no other tasks can + * concurrently modify the tree. + */ +struct extent_state *btrfs_next_extent_state(struct extent_io_tree *tree, + struct extent_state *state) +{ + struct extent_state *next; + + spin_lock(&tree->lock); + ASSERT(extent_state_in_tree(state)); + next = next_state(state); + if (next) + refcount_inc(&next->refs); + spin_unlock(&tree->lock); + + return next; } -void __cold extent_state_free_cachep(void) +void __cold btrfs_extent_state_free_cachep(void) { btrfs_extent_state_leak_debug_check(); kmem_cache_destroy(extent_state_cache); } -int __init extent_state_init_cachep(void) +int __init btrfs_extent_state_init_cachep(void) { extent_state_cache = kmem_cache_create("btrfs_extent_state", sizeof(struct extent_state), 0, 0, diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 6ffef1cd37c1..36facca37973 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -17,10 +17,10 @@ struct btrfs_inode; /* Bits for the extent state */ enum { ENUM_BIT(EXTENT_DIRTY), - ENUM_BIT(EXTENT_UPTODATE), ENUM_BIT(EXTENT_LOCKED), ENUM_BIT(EXTENT_DIO_LOCKED), - ENUM_BIT(EXTENT_NEW), + ENUM_BIT(EXTENT_DIRTY_LOG1), + ENUM_BIT(EXTENT_DIRTY_LOG2), ENUM_BIT(EXTENT_DELALLOC), ENUM_BIT(EXTENT_DEFRAG), ENUM_BIT(EXTENT_BOUNDARY), @@ -39,6 +39,11 @@ enum { */ ENUM_BIT(EXTENT_DELALLOC_NEW), /* + * Mark that a range is being locked for finishing an ordered extent. + * Used together with EXTENT_LOCKED. + */ + ENUM_BIT(EXTENT_FINISHING_ORDERED), + /* * When an ordered extent successfully completes for a region marked as * a new delalloc range, use this flag when clearing a new delalloc * range to indicate that the VFS' inode number of bytes should be @@ -130,117 +135,110 @@ struct extent_state { #endif }; -struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree); -const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree); -const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree); +const struct btrfs_inode *btrfs_extent_io_tree_to_inode(const struct extent_io_tree *tree); +const struct btrfs_fs_info *btrfs_extent_io_tree_to_fs_info(const struct extent_io_tree *tree); -void extent_io_tree_init(struct btrfs_fs_info *fs_info, - struct extent_io_tree *tree, unsigned int owner); -void extent_io_tree_release(struct extent_io_tree *tree); -int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - struct extent_state **cached); -bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - struct extent_state **cached); +void btrfs_extent_io_tree_init(struct btrfs_fs_info *fs_info, + struct extent_io_tree *tree, unsigned int owner); +void btrfs_extent_io_tree_release(struct extent_io_tree *tree); +int btrfs_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached); +bool btrfs_try_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached); -static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached) +static inline int btrfs_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached) { - return __lock_extent(tree, start, end, EXTENT_LOCKED, cached); + return btrfs_lock_extent_bits(tree, start, end, EXTENT_LOCKED, cached); } -static inline bool try_lock_extent(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) +static inline bool btrfs_try_lock_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) { - return __try_lock_extent(tree, start, end, EXTENT_LOCKED, cached); + return btrfs_try_lock_extent_bits(tree, start, end, EXTENT_LOCKED, cached); } -int __init extent_state_init_cachep(void); -void __cold extent_state_free_cachep(void); - -u64 count_range_bits(struct extent_io_tree *tree, - u64 *start, u64 search_end, - u64 max_bytes, u32 bits, int contig, - struct extent_state **cached_state); - -void free_extent_state(struct extent_state *state); -bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, - struct extent_state *cached_state); -bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit); -int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_changeset *changeset); -int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_state **cached, - struct extent_changeset *changeset); - -static inline int clear_extent_bit(struct extent_io_tree *tree, u64 start, - u64 end, u32 bits, - struct extent_state **cached) -{ - return __clear_extent_bit(tree, start, end, bits, cached, NULL); -} +int __init btrfs_extent_state_init_cachep(void); +void __cold btrfs_extent_state_free_cachep(void); -static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached) -{ - return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached, NULL); -} +u64 btrfs_count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, + u64 max_bytes, u32 bits, int contig, + struct extent_state **cached_state); -static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, u32 bits) +void btrfs_free_extent_state(struct extent_state *state); +bool btrfs_test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, + struct extent_state *cached_state); +bool btrfs_test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit); +void btrfs_get_range_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 *bits, + struct extent_state **cached_state); +int btrfs_clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_changeset *changeset); +int btrfs_clear_extent_bit_changeset(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached, + struct extent_changeset *changeset); + +static inline int btrfs_clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 end, u32 bits, + struct extent_state **cached) { - return clear_extent_bit(tree, start, end, bits, NULL); + return btrfs_clear_extent_bit_changeset(tree, start, end, bits, cached, NULL); } -int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_changeset *changeset); -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_state **cached_state); - -static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state) +static inline int btrfs_unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached) { - return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, - cached_state, NULL); + return btrfs_clear_extent_bit_changeset(tree, start, end, EXTENT_LOCKED, + cached, NULL); } -static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) +int btrfs_set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_changeset *changeset); +int btrfs_set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached_state); + +static inline int btrfs_clear_extent_dirty(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) { - return clear_extent_bit(tree, start, end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, cached); + return btrfs_clear_extent_bit(tree, start, end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, cached); } -int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, u32 clear_bits, - struct extent_state **cached_state); - -bool find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits, - struct extent_state **cached_state); -void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits); -int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits); +int btrfs_convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, u32 clear_bits, + struct extent_state **cached_state); + +bool btrfs_find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits, + struct extent_state **cached_state); +void btrfs_find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits); +bool btrfs_find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits); bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state); -static inline int lock_dio_extent(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) +static inline int btrfs_lock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) { - return __lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached); + return btrfs_lock_extent_bits(tree, start, end, EXTENT_DIO_LOCKED, cached); } -static inline bool try_lock_dio_extent(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) +static inline bool btrfs_try_lock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) { - return __try_lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached); + return btrfs_try_lock_extent_bits(tree, start, end, EXTENT_DIO_LOCKED, cached); } -static inline int unlock_dio_extent(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) +static inline int btrfs_unlock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) { - return __clear_extent_bit(tree, start, end, EXTENT_DIO_LOCKED, cached, NULL); + return btrfs_clear_extent_bit_changeset(tree, start, end, EXTENT_DIO_LOCKED, + cached, NULL); } +struct extent_state *btrfs_next_extent_state(struct extent_io_tree *tree, + struct extent_state *state); + #endif /* BTRFS_EXTENT_IO_TREE_H */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 412e318e4a22..97d517cdf2df 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -46,7 +46,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extra_op); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, @@ -56,12 +56,12 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 flags, u64 owner, u64 offset, struct btrfs_key *ins, int ref_mod, u64 oref_root); static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op); -static int find_next_key(struct btrfs_path *path, int level, +static int find_next_key(const struct btrfs_path *path, int level, struct btrfs_key *key); -static int block_group_bits(struct btrfs_block_group *cache, u64 bits) +static int block_group_bits(const struct btrfs_block_group *cache, u64 bits) { return (cache->flags & bits) == bits; } @@ -70,20 +70,17 @@ static int block_group_bits(struct btrfs_block_group *cache, u64 bits) int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) { struct btrfs_root *root = btrfs_extent_root(fs_info, start); - int ret; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = start; - key.offset = len; key.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - btrfs_free_path(path); - return ret; + key.offset = len; + return btrfs_search_slot(NULL, root, &key, path, 0, 0); } /* @@ -103,7 +100,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; u64 num_refs; u64 extent_flags; @@ -125,16 +122,16 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, search_again: key.objectid = bytenr; - key.offset = offset; if (metadata) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = offset; extent_root = btrfs_extent_root(fs_info, bytenr); ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) - goto out_free; + return ret; if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) { if (path->slots[0]) { @@ -159,7 +156,7 @@ search_again: "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); - goto out_free; + return ret; } ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); @@ -170,7 +167,7 @@ search_again: "unexpected zero reference count for extent item (%llu %u %llu)", key.objectid, key.type, key.offset); btrfs_abort_transaction(trans, ret); - goto out_free; + return ret; } extent_flags = btrfs_extent_flags(leaf, ei); owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]); @@ -216,8 +213,7 @@ search_again: *flags = extent_flags; if (owning_root) *owning_root = owner; -out_free: - btrfs_free_path(path); + return ret; } @@ -333,7 +329,7 @@ out_free: * is_data == BTRFS_REF_TYPE_ANY, either type is OK. */ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, - struct btrfs_extent_inline_ref *iref, + const struct btrfs_extent_inline_ref *iref, enum btrfs_inline_ref_type is_data) { struct btrfs_fs_info *fs_info = eb->fs_info; @@ -405,23 +401,23 @@ u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) return ((u64)high_crc << 31) ^ (u64)low_crc; } -static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, - struct btrfs_extent_data_ref *ref) +static u64 hash_extent_data_ref_item(const struct extent_buffer *leaf, + const struct btrfs_extent_data_ref *ref) { return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), btrfs_extent_data_ref_objectid(leaf, ref), btrfs_extent_data_ref_offset(leaf, ref)); } -static int match_extent_data_ref(struct extent_buffer *leaf, - struct btrfs_extent_data_ref *ref, - u64 root_objectid, u64 owner, u64 offset) +static bool match_extent_data_ref(const struct extent_buffer *leaf, + const struct btrfs_extent_data_ref *ref, + u64 root_objectid, u64 owner, u64 offset) { if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || btrfs_extent_data_ref_objectid(leaf, ref) != owner || btrfs_extent_data_ref_offset(leaf, ref) != offset) - return 0; - return 1; + return false; + return true; } static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, @@ -501,7 +497,7 @@ fail: static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, u64 bytenr) { struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); @@ -570,7 +566,6 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref, num_refs); } } - btrfs_mark_buffer_dirty(trans, leaf); ret = 0; fail: btrfs_release_path(path); @@ -618,18 +613,17 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); else if (key.type == BTRFS_SHARED_DATA_REF_KEY) btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); - btrfs_mark_buffer_dirty(trans, leaf); } return ret; } -static noinline u32 extent_data_ref_count(struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref) +static noinline u32 extent_data_ref_count(const struct btrfs_path *path, + const struct btrfs_extent_inline_ref *iref) { struct btrfs_key key; struct extent_buffer *leaf; - struct btrfs_extent_data_ref *ref1; - struct btrfs_shared_data_ref *ref2; + const struct btrfs_extent_data_ref *ref1; + const struct btrfs_shared_data_ref *ref2; u32 num_refs = 0; int type; @@ -644,10 +638,10 @@ static noinline u32 extent_data_ref_count(struct btrfs_path *path, type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); ASSERT(type != BTRFS_REF_TYPE_INVALID); if (type == BTRFS_EXTENT_DATA_REF_KEY) { - ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); + ref1 = (const struct btrfs_extent_data_ref *)(&iref->offset); num_refs = btrfs_extent_data_ref_count(leaf, ref1); } else { - ref2 = (struct btrfs_shared_data_ref *)(iref + 1); + ref2 = (const struct btrfs_shared_data_ref *)(iref + 1); num_refs = btrfs_shared_data_ref_count(leaf, ref2); } } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { @@ -690,7 +684,7 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, u64 bytenr) { struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); @@ -728,7 +722,7 @@ static inline int extent_ref_type(u64 parent, u64 owner) return type; } -static int find_next_key(struct btrfs_path *path, int level, +static int find_next_key(const struct btrfs_path *path, int level, struct btrfs_key *key) { @@ -1050,7 +1044,6 @@ void setup_inline_extent_backref(struct btrfs_trans_handle *trans, } else { btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); } - btrfs_mark_buffer_dirty(trans, leaf); } static int lookup_extent_backref(struct btrfs_trans_handle *trans, @@ -1195,7 +1188,6 @@ static noinline_for_stack int update_inline_extent_backref( item_size -= size; btrfs_truncate_item(trans, path, item_size, 1); } - btrfs_mark_buffer_dirty(trans, leaf); return 0; } @@ -1260,12 +1252,12 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, { int j, ret = 0; u64 bytes_left, end; - u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT); + u64 aligned_start = ALIGN(start, SECTOR_SIZE); /* Adjust the range to be aligned to 512B sectors if necessary. */ if (start != aligned_start) { len -= aligned_start - start; - len = round_down(len, 1 << SECTOR_SHIFT); + len = round_down(len, SECTOR_SIZE); start = aligned_start; } @@ -1488,10 +1480,10 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, * */ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_extent_item *item; struct btrfs_key key; @@ -1512,7 +1504,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, node->parent, node->ref_root, owner, offset, refs_to_add, extent_op); if ((ret < 0 && ret != -EAGAIN) || !ret) - goto out; + return ret; /* * Ok we had -EAGAIN which means we didn't have space to insert and @@ -1527,24 +1519,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (extent_op) __run_delayed_extent_op(extent_op, leaf, item); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* now insert the actual backref */ - if (owner < BTRFS_FIRST_FREE_OBJECTID) + if (owner < BTRFS_FIRST_FREE_OBJECTID) { ret = insert_tree_block_ref(trans, path, node, bytenr); - else + if (ret) + btrfs_abort_transaction(trans, ret); + } else { ret = insert_extent_data_ref(trans, path, node, bytenr); + if (ret) + btrfs_abort_transaction(trans, ret); + } - if (ret) - btrfs_abort_transaction(trans, ret); -out: - btrfs_free_path(path); return ret; } static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_head *href) + const struct btrfs_delayed_ref_head *href) { u64 root = href->owning_root; @@ -1553,7 +1545,7 @@ static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info, * where it has already been unset. */ if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE || - !href->is_data || !is_fstree(root)) + !href->is_data || !btrfs_is_fstree(root)) return; btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes, @@ -1562,7 +1554,7 @@ static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info, static int run_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { @@ -1630,13 +1622,13 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, } static int run_delayed_extent_op(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head, + const struct btrfs_delayed_ref_head *head, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_extent_item *ei; struct extent_buffer *leaf; u32 item_size; @@ -1667,7 +1659,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, again: ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { - goto out; + return ret; } else if (ret > 0) { if (metadata) { if (path->slots[0] > 0) { @@ -1684,8 +1676,8 @@ again: metadata = 0; key.objectid = head->bytenr; - key.offset = head->num_bytes; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = head->num_bytes; goto again; } } else { @@ -1693,7 +1685,7 @@ again: btrfs_err(fs_info, "missing extent item for extent %llu num_bytes %llu level %d", head->bytenr, head->num_bytes, head->level); - goto out; + return ret; } } @@ -1706,21 +1698,18 @@ again: "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); - goto out; + return ret; } ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); __run_delayed_extent_op(extent_op, leaf, ei); - btrfs_mark_buffer_dirty(trans, leaf); -out: - btrfs_free_path(path); return ret; } static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { @@ -1767,7 +1756,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, /* helper function to actually process a single delayed ref entry */ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { @@ -1803,30 +1792,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, return ret; } -static inline struct btrfs_delayed_ref_node * -select_delayed_ref(struct btrfs_delayed_ref_head *head) -{ - struct btrfs_delayed_ref_node *ref; - - if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) - return NULL; - - /* - * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. - * This is to prevent a ref count from going down to zero, which deletes - * the extent item from the extent tree, when there still are references - * to add, which would fail because they would not find the extent item. - */ - if (!list_empty(&head->ref_add_list)) - return list_first_entry(&head->ref_add_list, - struct btrfs_delayed_ref_node, add_list); - - ref = rb_entry(rb_first_cached(&head->ref_tree), - struct btrfs_delayed_ref_node, ref_node); - ASSERT(list_empty(&ref->add_list)); - return ref; -} - static struct btrfs_delayed_extent_op *cleanup_extent_op( struct btrfs_delayed_ref_head *head) { @@ -1959,7 +1924,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, lockdep_assert_held(&locked_ref->mutex); lockdep_assert_held(&locked_ref->lock); - while ((ref = select_delayed_ref(locked_ref))) { + while ((ref = btrfs_select_delayed_ref(locked_ref))) { if (ref->seq && btrfs_check_delayed_seq(fs_info, ref->seq)) { spin_unlock(&locked_ref->lock); @@ -2043,7 +2008,12 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; if (min_bytes == 0) { - max_count = delayed_refs->num_heads_ready; + /* + * We may be subject to a harmless race if some task is + * concurrently adding or removing a delayed ref, so silence + * KCSAN and similar tools. + */ + max_count = data_race(delayed_refs->num_heads_ready); min_bytes = U64_MAX; } @@ -2230,10 +2200,11 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, return ret; } -static noinline int check_delayed_ref(struct btrfs_root *root, +static noinline int check_delayed_ref(struct btrfs_inode *inode, struct btrfs_path *path, - u64 objectid, u64 offset, u64 bytenr) + u64 offset, u64 bytenr) { + struct btrfs_root *root = inode->root; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_root *delayed_refs; @@ -2307,7 +2278,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, * then we have a cross reference. */ if (ref->ref_root != btrfs_root_id(root) || - ref_owner != objectid || ref_offset != offset) { + ref_owner != btrfs_ino(inode) || ref_offset != offset) { ret = 1; break; } @@ -2318,11 +2289,53 @@ static noinline int check_delayed_ref(struct btrfs_root *root, return ret; } -static noinline int check_committed_ref(struct btrfs_root *root, +/* + * Check if there are references for a data extent other than the one belonging + * to the given inode and offset. + * + * @inode: The only inode we expect to find associated with the data extent. + * @path: A path to use for searching the extent tree. + * @offset: The only offset we expect to find associated with the data extent. + * @bytenr: The logical address of the data extent. + * + * When the extent does not have any other references other than the one we + * expect to find, we always return a value of 0 with the path having a locked + * leaf that contains the extent's extent item - this is necessary to ensure + * we don't race with a task running delayed references, and our caller must + * have such a path when calling check_delayed_ref() - it must lock a delayed + * ref head while holding the leaf locked. In case the extent item is not found + * in the extent tree, we return -ENOENT with the path having the leaf (locked) + * where the extent item should be, in order to prevent races with another task + * running delayed references, so that we don't miss any reference when calling + * check_delayed_ref(). + * + * Note: this may return false positives, and this is because we want to be + * quick here as we're called in write paths (when flushing delalloc and + * in the direct IO write path). For example we can have an extent with + * a single reference but that reference is not inlined, or we may have + * many references in the extent tree but we also have delayed references + * that cancel all the reference except the one for our inode and offset, + * but it would be expensive to do such checks and complex due to all + * locking to avoid races between the checks and flushing delayed refs, + * plus non-inline references may be located on leaves other than the one + * that contains the extent item in the extent tree. The important thing + * here is to not return false negatives and that the false positives are + * not very common. + * + * Returns: 0 if there are no cross references and with the path having a locked + * leaf from the extent tree that contains the extent's extent item. + * + * 1 if there are cross references (false positives can happen). + * + * < 0 in case of an error. In case of -ENOENT the leaf in the extent + * tree where the extent item should be located at is read locked and + * accessible in the given path. + */ +static noinline int check_committed_ref(struct btrfs_inode *inode, struct btrfs_path *path, - u64 objectid, u64 offset, u64 bytenr, - bool strict) + u64 offset, u64 bytenr) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr); struct extent_buffer *leaf; @@ -2336,40 +2349,37 @@ static noinline int check_committed_ref(struct btrfs_root *root, int ret; key.objectid = bytenr; - key.offset = (u64)-1; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret == 0) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. */ - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } - ret = -ENOENT; if (path->slots[0] == 0) - goto out; + return -ENOENT; path->slots[0]--; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) - goto out; + return -ENOENT; - ret = 1; item_size = btrfs_item_size(leaf, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY); /* No inline refs; we need to bail before checking for owner ref. */ if (item_size == sizeof(*ei)) - goto out; + return 1; /* Check for an owner ref; skip over it to the real inline refs. */ iref = (struct btrfs_extent_inline_ref *)(ei + 1); @@ -2377,56 +2387,69 @@ static noinline int check_committed_ref(struct btrfs_root *root, if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) { expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY); iref = (struct btrfs_extent_inline_ref *)(iref + 1); + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); } /* If extent item has more than 1 inline ref then it's shared */ if (item_size != expected_size) - goto out; - - /* - * If extent created before last snapshot => it's shared unless the - * snapshot has been deleted. Use the heuristic if strict is false. - */ - if (!strict && - (btrfs_extent_generation(leaf, ei) <= - btrfs_root_last_snapshot(&root->root_item))) - goto out; + return 1; /* If this extent has SHARED_DATA_REF then it's shared */ - type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); if (type != BTRFS_EXTENT_DATA_REF_KEY) - goto out; + return 1; ref = (struct btrfs_extent_data_ref *)(&iref->offset); if (btrfs_extent_refs(leaf, ei) != btrfs_extent_data_ref_count(leaf, ref) || btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) || - btrfs_extent_data_ref_objectid(leaf, ref) != objectid || + btrfs_extent_data_ref_objectid(leaf, ref) != btrfs_ino(inode) || btrfs_extent_data_ref_offset(leaf, ref) != offset) - goto out; + return 1; - ret = 0; -out: - return ret; + return 0; } -int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, - u64 bytenr, bool strict, struct btrfs_path *path) +int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset, + u64 bytenr, struct btrfs_path *path) { int ret; do { - ret = check_committed_ref(root, path, objectid, - offset, bytenr, strict); + ret = check_committed_ref(inode, path, offset, bytenr); if (ret && ret != -ENOENT) goto out; - ret = check_delayed_ref(root, path, objectid, offset, bytenr); - } while (ret == -EAGAIN); + /* + * The path must have a locked leaf from the extent tree where + * the extent item for our extent is located, in case it exists, + * or where it should be located in case it doesn't exist yet + * because it's new and its delayed ref was not yet flushed. + * We need to lock the delayed ref head at check_delayed_ref(), + * if one exists, while holding the leaf locked in order to not + * race with delayed ref flushing, missing references and + * incorrectly reporting that the extent is not shared. + */ + if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) { + struct extent_buffer *leaf = path->nodes[0]; + + ASSERT(leaf != NULL); + btrfs_assert_tree_read_locked(leaf); + + if (ret != -ENOENT) { + struct btrfs_key key; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ASSERT(key.objectid == bytenr); + ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY); + } + } + + ret = check_delayed_ref(inode, path, offset, bytenr); + } while (ret == -EAGAIN && !path->nowait); out: btrfs_release_path(path); - if (btrfs_is_data_reloc_root(root)) + if (btrfs_is_data_reloc_root(inode->root)) WARN_ON(ret > 0); return ret; } @@ -2571,13 +2594,10 @@ static int pin_down_extent(struct btrfs_trans_handle *trans, struct btrfs_block_group *cache, u64 bytenr, u64 num_bytes, int reserved) { - struct btrfs_fs_info *fs_info = cache->fs_info; - spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; - btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info, - num_bytes); + btrfs_space_info_update_bytes_pinned(cache->space_info, num_bytes); if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; @@ -2585,8 +2605,8 @@ static int pin_down_extent(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - set_extent_bit(&trans->transaction->pinned_extents, bytenr, - bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); + btrfs_set_extent_bit(&trans->transaction->pinned_extents, bytenr, + bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); return 0; } @@ -2724,15 +2744,15 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, { struct btrfs_block_group *cache = NULL; struct btrfs_space_info *space_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; struct btrfs_free_cluster *cluster = NULL; - u64 len; u64 total_unpinned = 0; u64 empty_cluster = 0; bool readonly; int ret = 0; while (start <= end) { + u64 len; + readonly = false; if (!cache || start >= cache->start + cache->length) { @@ -2778,37 +2798,19 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; - btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); + btrfs_space_info_update_bytes_pinned(space_info, -len); space_info->max_extent_size = 0; if (cache->ro) { space_info->bytes_readonly += len; readonly = true; } else if (btrfs_is_zoned(fs_info)) { /* Need reset before reusing in a zoned block group */ - btrfs_space_info_update_bytes_zone_unusable(fs_info, space_info, - len); + btrfs_space_info_update_bytes_zone_unusable(space_info, len); readonly = true; } spin_unlock(&cache->lock); - if (!readonly && return_free_space && - global_rsv->space_info == space_info) { - spin_lock(&global_rsv->lock); - if (!global_rsv->full) { - u64 to_add = min(len, global_rsv->size - - global_rsv->reserved); - - global_rsv->reserved += to_add; - btrfs_space_info_update_bytes_may_use(fs_info, - space_info, to_add); - if (global_rsv->reserved >= global_rsv->size) - global_rsv->full = 1; - len -= to_add; - } - spin_unlock(&global_rsv->lock); - } - /* Add to any tickets we may have */ - if (!readonly && return_free_space && len) - btrfs_try_granting_tickets(fs_info, space_info); + if (!readonly && return_free_space) + btrfs_return_free_space(space_info, len); spin_unlock(&space_info->lock); } @@ -2823,34 +2825,63 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *block_group, *tmp; struct list_head *deleted_bgs; - struct extent_io_tree *unpin; + struct extent_io_tree *unpin = &trans->transaction->pinned_extents; + struct extent_state *cached_state = NULL; u64 start; u64 end; + int unpin_error = 0; int ret; - unpin = &trans->transaction->pinned_extents; - - while (!TRANS_ABORTED(trans)) { - struct extent_state *cached_state = NULL; + mutex_lock(&fs_info->unused_bg_unpin_mutex); + btrfs_find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY, &cached_state); - mutex_lock(&fs_info->unused_bg_unpin_mutex); - if (!find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, &cached_state)) { - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - break; - } + while (!TRANS_ABORTED(trans) && cached_state) { + struct extent_state *next_state; if (btrfs_test_opt(fs_info, DISCARD_SYNC)) ret = btrfs_discard_extent(fs_info, start, end + 1 - start, NULL); - clear_extent_dirty(unpin, start, end, &cached_state); + next_state = btrfs_next_extent_state(unpin, cached_state); + btrfs_clear_extent_dirty(unpin, start, end, &cached_state); ret = unpin_extent_range(fs_info, start, end, true); - BUG_ON(ret); - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - free_extent_state(cached_state); - cond_resched(); + /* + * If we get an error unpinning an extent range, store the first + * error to return later after trying to unpin all ranges and do + * the sync discards. Our caller will abort the transaction + * (which already wrote new superblocks) and on the next mount + * the space will be available as it was pinned by in-memory + * only structures in this phase. + */ + if (ret) { + btrfs_err_rl(fs_info, +"failed to unpin extent range [%llu, %llu] when committing transaction %llu: %s (%d)", + start, end, trans->transid, + btrfs_decode_error(ret), ret); + if (!unpin_error) + unpin_error = ret; + } + + btrfs_free_extent_state(cached_state); + + if (need_resched()) { + btrfs_free_extent_state(next_state); + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + cond_resched(); + cached_state = NULL; + mutex_lock(&fs_info->unused_bg_unpin_mutex); + btrfs_find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, &cached_state); + } else { + cached_state = next_state; + if (cached_state) { + start = cached_state->start; + end = cached_state->end; + } + } } + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + btrfs_free_extent_state(cached_state); if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) { btrfs_discard_calc_delay(&fs_info->discard_ctl); @@ -2864,16 +2895,20 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) */ deleted_bgs = &trans->transaction->deleted_bgs; list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { - u64 trimmed = 0; - ret = -EROFS; if (!TRANS_ABORTED(trans)) - ret = btrfs_discard_extent(fs_info, - block_group->start, - block_group->length, - &trimmed); + ret = btrfs_discard_extent(fs_info, block_group->start, + block_group->length, NULL); + /* + * Not strictly necessary to lock, as the block_group should be + * read-only from btrfs_delete_unused_bgs(). + */ + ASSERT(block_group->ro); + spin_lock(&fs_info->unused_bgs_lock); list_del_init(&block_group->bg_list); + spin_unlock(&fs_info->unused_bgs_lock); + btrfs_unfreeze_block_group(block_group); btrfs_put_block_group(block_group); @@ -2885,7 +2920,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) } } - return 0; + return unpin_error; } /* @@ -2965,7 +3000,7 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, return ret; } - ret = add_to_free_space_tree(trans, bytenr, num_bytes); + ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes); if (ret) { btrfs_abort_transaction(trans, ret); return ret; @@ -3046,7 +3081,7 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, */ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *info = trans->fs_info; @@ -3259,7 +3294,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } else { btrfs_set_extent_refs(leaf, ei, refs); - btrfs_mark_buffer_dirty(trans, leaf); } if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, @@ -3481,17 +3515,11 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans, WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(bg, buf->start, buf->len); - btrfs_free_reserved_bytes(bg, buf->len, 0); + btrfs_free_reserved_bytes(bg, buf->len, false); btrfs_put_block_group(bg); trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); out: - - /* - * Deleting the buffer, clear the corrupt flag since it doesn't - * matter anymore. - */ - clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); return 0; } @@ -3623,6 +3651,21 @@ btrfs_release_block_group(struct btrfs_block_group *cache, btrfs_put_block_group(cache); } +static bool find_free_extent_check_size_class(const struct find_free_extent_ctl *ffe_ctl, + const struct btrfs_block_group *bg) +{ + if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) + return true; + if (!btrfs_block_group_should_use_size_class(bg)) + return true; + if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) + return true; + if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && + bg->size_class == BTRFS_BG_SZ_NONE) + return true; + return ffe_ctl->size_class == bg->size_class; +} + /* * Helper function for find_free_extent(). * @@ -3644,7 +3687,8 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, if (!cluster_bg) goto refill_cluster; if (cluster_bg != bg && (cluster_bg->ro || - !block_group_bits(cluster_bg, ffe_ctl->flags))) + !block_group_bits(cluster_bg, ffe_ctl->flags) || + !find_free_extent_check_size_class(ffe_ctl, cluster_bg))) goto release_cluster; offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr, @@ -4109,6 +4153,7 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info, static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, struct btrfs_key *ins, struct find_free_extent_ctl *ffe_ctl, + struct btrfs_space_info *space_info, bool full_search) { struct btrfs_root *root = fs_info->chunk_root; @@ -4163,7 +4208,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return ret; } - ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, + ret = btrfs_chunk_alloc(trans, space_info, ffe_ctl->flags, CHUNK_ALLOC_FORCE_FOR_EXTENT); /* Do not bail out on ENOSPC since we can do more. */ @@ -4200,21 +4245,6 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return -ENOSPC; } -static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl, - struct btrfs_block_group *bg) -{ - if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) - return true; - if (!btrfs_block_group_should_use_size_class(bg)) - return true; - if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) - return true; - if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && - bg->size_class == BTRFS_BG_SZ_NONE) - return true; - return ffe_ctl->size_class == bg->size_class; -} - static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, @@ -4380,11 +4410,22 @@ static noinline int find_free_extent(struct btrfs_root *root, ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(root, ffe_ctl); + trace_btrfs_find_free_extent(root, ffe_ctl); space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); + if (btrfs_is_zoned(fs_info) && space_info) { + /* Use dedicated sub-space_info for dedicated block group users. */ + if (ffe_ctl->for_data_reloc) { + space_info = space_info->sub_group[0]; + ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC); + } else if (ffe_ctl->for_treelog) { + space_info = space_info->sub_group[0]; + ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_TREELOG); + } + } if (!space_info) { - btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags); + btrfs_err(fs_info, "no space info for %llu, tree-log %d, relocation %d", + ffe_ctl->flags, ffe_ctl->for_treelog, ffe_ctl->for_data_reloc); return -ENOSPC; } @@ -4406,6 +4447,7 @@ static noinline int find_free_extent(struct btrfs_root *root, * picked out then we don't care that the block group is cached. */ if (block_group && block_group_bits(block_group, ffe_ctl->flags) && + block_group->space_info == space_info && block_group->cached != BTRFS_CACHE_NO) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || @@ -4431,7 +4473,7 @@ static noinline int find_free_extent(struct btrfs_root *root, } } search: - trace_find_free_extent_search_loop(root, ffe_ctl); + trace_btrfs_find_free_extent_search_loop(root, ffe_ctl); ffe_ctl->have_caching_bg = false; if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || ffe_ctl->index == 0) @@ -4483,7 +4525,7 @@ search: } have_block_group: - trace_find_free_extent_have_block_group(root, ffe_ctl, block_group); + trace_btrfs_find_free_extent_have_block_group(root, ffe_ctl, block_group); ffe_ctl->cached = btrfs_block_group_done(block_group); if (unlikely(!ffe_ctl->cached)) { ffe_ctl->have_caching_bg = true; @@ -4576,7 +4618,8 @@ loop: } up_read(&space_info->groups_sem); - ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search); + ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, space_info, + full_search); if (ret > 0) goto search; @@ -4698,8 +4741,8 @@ again: return ret; } -int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len, int delalloc) +int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, + bool is_delalloc) { struct btrfs_block_group *cache; @@ -4711,7 +4754,7 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, } btrfs_add_free_space(cache, start, len); - btrfs_free_reserved_bytes(cache, len, delalloc); + btrfs_free_reserved_bytes(cache, len, is_delalloc); trace_btrfs_reserved_extent_free(fs_info, start, len); btrfs_put_block_group(cache); @@ -4742,7 +4785,7 @@ static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr, struct btrfs_fs_info *fs_info = trans->fs_info; int ret; - ret = remove_from_free_space_tree(trans, bytenr, num_bytes); + ret = btrfs_remove_from_free_space_tree(trans, bytenr, num_bytes); if (ret) return ret; @@ -4827,14 +4870,13 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); } - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_free_path(path); return alloc_reserved_extent(trans, ins->objectid, ins->offset); } static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -4902,7 +4944,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root); } - btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); @@ -4923,7 +4964,7 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID); - if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root)) + if (btrfs_is_data_reloc_root(root) && btrfs_is_fstree(root->relocation_src_root)) generic_ref.owning_root = root->relocation_src_root; btrfs_init_data_ref(&generic_ref, owner, offset, 0, false); @@ -4945,7 +4986,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, int ret; struct btrfs_block_group *block_group; struct btrfs_space_info *space_info; - struct btrfs_squota_delta delta = { + const struct btrfs_squota_delta delta = { .root = root_objectid, .num_bytes = ins->offset, .generation = trans->transid, @@ -5071,17 +5112,17 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, * EXTENT bit to differentiate dirty pages. */ if (buf->log_index == 0) - set_extent_bit(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1, - EXTENT_DIRTY, NULL); + btrfs_set_extent_bit(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, + EXTENT_DIRTY_LOG1, NULL); else - set_extent_bit(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1, - EXTENT_NEW, NULL); + btrfs_set_extent_bit(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, + EXTENT_DIRTY_LOG2, NULL); } else { buf->log_index = -1; - set_extent_bit(&trans->transaction->dirty_pages, buf->start, - buf->start + buf->len - 1, EXTENT_DIRTY, NULL); + btrfs_set_extent_bit(&trans->transaction->dirty_pages, buf->start, + buf->start + buf->len - 1, EXTENT_DIRTY, NULL); } /* this returns a buffer locked for blocking */ return buf; @@ -5187,7 +5228,7 @@ out_free_buf: btrfs_tree_unlock(buf); free_extent_buffer(buf); out_free_reserved: - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, false); out_unuse: btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize); return ERR_PTR(ret); @@ -5285,7 +5326,7 @@ static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control * * reference to it. */ generation = btrfs_node_ptr_generation(eb, slot); - if (!wc->update_ref || generation <= root->root_key.offset) + if (!wc->update_ref || generation <= btrfs_root_origin_generation(root)) return false; /* @@ -5340,7 +5381,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, goto reada; if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) + generation <= btrfs_root_origin_generation(root)) continue; /* We don't lock the tree block, it's OK to be racy here */ @@ -5467,7 +5508,7 @@ static int check_ref_exists(struct btrfs_trans_handle *trans, { struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *head; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_extent_inline_ref *iref; int ret; bool exists = false; @@ -5484,7 +5525,6 @@ again: * If we get 0 then we found our reference, return 1, else * return the error if it's not -ENOENT; */ - btrfs_free_path(path); return (ret < 0 ) ? ret : 1; } @@ -5515,11 +5555,10 @@ again: goto again; } - exists = btrfs_find_delayed_tree_ref(head, root->root_key.objectid, parent); + exists = btrfs_find_delayed_tree_ref(head, btrfs_root_id(root), parent); mutex_unlock(&head->mutex); out: spin_unlock(&delayed_refs->lock); - btrfs_free_path(path); return exists ? 1 : 0; } @@ -5683,7 +5722,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, * for the subtree */ if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) { + generation <= btrfs_root_origin_generation(root)) { wc->lookup_info = 1; return 1; } @@ -5836,15 +5875,20 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (wc->refs[level] == 1) { if (level == 0) { - if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { ret = btrfs_dec_ref(trans, root, eb, 1); - else + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + } else { ret = btrfs_dec_ref(trans, root, eb, 0); - if (ret) { - btrfs_abort_transaction(trans, ret); - return ret; + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } } - if (is_fstree(btrfs_root_id(root))) { + if (btrfs_is_fstree(btrfs_root_id(root))) { ret = btrfs_qgroup_trace_leaf_items(trans, eb); if (ret) { btrfs_err_rl(fs_info, @@ -6287,7 +6331,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *parent) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct walk_control *wc; int level; int parent_level; @@ -6300,14 +6344,12 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return -ENOMEM; wc = kzalloc(sizeof(*wc), GFP_NOFS); - if (!wc) { - btrfs_free_path(path); + if (!wc) return -ENOMEM; - } btrfs_assert_tree_write_locked(parent); parent_level = btrfs_header_level(parent); - atomic_inc(&parent->refs); + refcount_inc(&parent->refs); path->nodes[parent_level] = parent; path->slots[parent_level] = btrfs_header_nritems(parent); @@ -6340,7 +6382,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, } kfree(wc); - btrfs_free_path(path); return ret; } @@ -6402,14 +6443,14 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) if (ret) break; - find_first_clear_extent_bit(&device->alloc_state, start, - &start, &end, - CHUNK_TRIMMED | CHUNK_ALLOCATED); + btrfs_find_first_clear_extent_bit(&device->alloc_state, start, + &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); /* Check if there are any CHUNK_* bits left */ if (start > device->total_bytes) { - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); - btrfs_warn_in_rcu(fs_info, + DEBUG_WARN(); + btrfs_warn(fs_info, "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu", start, end - start + 1, btrfs_dev_name(device), @@ -6441,8 +6482,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) ret = btrfs_issue_discard(device->bdev, start, len, &bytes); if (!ret) - set_extent_bit(&device->alloc_state, start, - start + bytes - 1, CHUNK_TRIMMED, NULL); + btrfs_set_extent_bit(&device->alloc_state, start, + start + bytes - 1, CHUNK_TRIMMED, NULL); mutex_unlock(&fs_info->chunk_mutex); if (ret) diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 2ad51130c037..82d3a82dc712 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -4,7 +4,6 @@ #define BTRFS_EXTENT_TREE_H #include <linux/types.h> -#include "misc.h" #include "block-group.h" #include "locking.h" @@ -98,7 +97,7 @@ enum btrfs_inline_ref_type { }; int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, - struct btrfs_extent_inline_ref *iref, + const struct btrfs_extent_inline_ref *iref, enum btrfs_inline_ref_type is_data); u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); @@ -116,8 +115,7 @@ int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, const struct extent_buffer *eb); int btrfs_exclude_logged_extents(struct extent_buffer *eb); -int btrfs_cross_ref_exist(struct btrfs_root *root, - u64 objectid, u64 offset, u64 bytenr, bool strict, +int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset, u64 bytenr, struct btrfs_path *path); struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -151,8 +149,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, int slot); -int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len, int delalloc); +int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, + bool is_delalloc); int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, const struct extent_buffer *eb); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); @@ -163,5 +161,9 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, struct extent_buffer *parent); +void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); +int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, + u64 num_bytes, u64 *actual_bytes); +int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); #endif diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b923d0cec61c..835b0deef9bb 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -75,9 +75,9 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) while (!list_empty(&fs_info->allocated_ebs)) { eb = list_first_entry(&fs_info->allocated_ebs, struct extent_buffer, leak_list); - pr_err( - "BTRFS: buffer leak start %llu len %u refs %d bflags %lu owner %llu\n", - eb->start, eb->len, atomic_read(&eb->refs), eb->bflags, + btrfs_err(fs_info, + "buffer leak start %llu len %u refs %d bflags %lu owner %llu", + eb->start, eb->len, refcount_read(&eb->refs), eb->bflags, btrfs_header_owner(eb)); list_del(&eb->leak_list); WARN_ON_ONCE(1); @@ -96,6 +96,8 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) */ struct btrfs_bio_ctrl { struct btrfs_bio *bbio; + /* Last byte contained in bbio + 1 . */ + loff_t next_file_offset; enum btrfs_compression_type compress_type; u32 len_to_oe_boundary; blk_opf_t opf; @@ -108,6 +110,7 @@ struct btrfs_bio_ctrl { * This is to avoid touching ranges covered by compression/inline. */ unsigned long submit_bitmap; + struct readahead_control *ractl; }; static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) @@ -198,9 +201,8 @@ static void __process_folios_contig(struct address_space *mapping, u64 end, unsigned long page_ops) { struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); - pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; - pgoff_t index = start_index; struct folio_batch fbatch; int i; @@ -221,36 +223,27 @@ static void __process_folios_contig(struct address_space *mapping, } } -static noinline void __unlock_for_delalloc(const struct inode *inode, - const struct folio *locked_folio, +static noinline void unlock_delalloc_folio(const struct inode *inode, + struct folio *locked_folio, u64 start, u64 end) { - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; - ASSERT(locked_folio); - if (index == locked_folio->index && end_index == index) - return; __process_folios_contig(inode->i_mapping, locked_folio, start, end, PAGE_UNLOCK); } static noinline int lock_delalloc_folios(struct inode *inode, - const struct folio *locked_folio, + struct folio *locked_folio, u64 start, u64 end) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct address_space *mapping = inode->i_mapping; - pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; - pgoff_t index = start_index; u64 processed_end = start; struct folio_batch fbatch; - if (index == locked_folio->index && index == end_index) - return 0; - folio_batch_init(&fbatch); while (index <= end_index) { unsigned int found_folios, i; @@ -274,8 +267,7 @@ static noinline int lock_delalloc_folios(struct inode *inode, goto out; } range_start = max_t(u64, folio_pos(folio), start); - range_len = min_t(u64, folio_pos(folio) + folio_size(folio), - end + 1) - range_start; + range_len = min_t(u64, folio_end(folio), end + 1) - range_start; btrfs_folio_set_lock(fs_info, folio, range_start, range_len); processed_end = range_start + range_len - 1; @@ -288,8 +280,7 @@ static noinline int lock_delalloc_folios(struct inode *inode, out: folio_batch_release(&fbatch); if (processed_end > start) - __unlock_for_delalloc(inode, locked_folio, start, - processed_end); + unlock_delalloc_folio(inode, locked_folio, start, processed_end); return -EAGAIN; } @@ -330,7 +321,7 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, ASSERT(orig_end > orig_start); /* The range should at least cover part of the folio */ - ASSERT(!(orig_start >= folio_pos(locked_folio) + folio_size(locked_folio) || + ASSERT(!(orig_start >= folio_end(locked_folio) || orig_end <= folio_pos(locked_folio))); again: /* step one, find a bunch of delalloc bytes starting at start */ @@ -343,7 +334,7 @@ again: /* @delalloc_end can be -1, never go beyond @orig_end */ *end = min(delalloc_end, orig_end); - free_extent_state(cached_state); + btrfs_free_extent_state(cached_state); return false; } @@ -369,7 +360,7 @@ again: /* some of the folios are gone, lets avoid looping by * shortening the size of the delalloc range we're searching */ - free_extent_state(cached_state); + btrfs_free_extent_state(cached_state); cached_state = NULL; if (!loops) { max_bytes = PAGE_SIZE; @@ -382,15 +373,15 @@ again: } /* step three, lock the state bits for the whole range */ - lock_extent(tree, delalloc_start, delalloc_end, &cached_state); + btrfs_lock_extent(tree, delalloc_start, delalloc_end, &cached_state); /* then test to make sure it is all still delalloc */ - ret = test_range_bit(tree, delalloc_start, delalloc_end, - EXTENT_DELALLOC, cached_state); + ret = btrfs_test_range_bit(tree, delalloc_start, delalloc_end, + EXTENT_DELALLOC, cached_state); - unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); + btrfs_unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); if (!ret) { - __unlock_for_delalloc(inode, locked_folio, delalloc_start, + unlock_delalloc_folio(inode, locked_folio, delalloc_start, delalloc_end); cond_resched(); goto again; @@ -406,7 +397,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached, u32 clear_bits, unsigned long page_ops) { - clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached); + btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached); __process_folios_contig(inode->vfs_inode.i_mapping, locked_folio, start, end, page_ops); @@ -428,14 +419,14 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); ASSERT(folio_pos(folio) <= start && - start + len <= folio_pos(folio) + PAGE_SIZE); + start + len <= folio_end(folio)); if (uptodate && btrfs_verify_folio(folio, start, len)) btrfs_folio_set_uptodate(fs_info, folio, start, len); else btrfs_folio_clear_uptodate(fs_info, folio, start, len); - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_is_subpage(fs_info, folio)) folio_unlock(folio); else btrfs_folio_end_lock(fs_info, folio, start, len); @@ -465,9 +456,6 @@ static void end_bbio_data_write(struct btrfs_bio *bbio) u64 start = folio_pos(folio) + fi.offset; u32 len = fi.length; - /* Only order 0 (single page) folios are allowed for data. */ - ASSERT(folio_order(folio) == 0); - /* Our read/write should always be sector aligned. */ if (!IS_ALIGNED(fi.offset, sectorsize)) btrfs_err(fs_info, @@ -491,11 +479,11 @@ static void end_bbio_data_write(struct btrfs_bio *bbio) static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) { ASSERT(folio_test_locked(folio)); - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_is_subpage(fs_info, folio)) return; ASSERT(folio_test_private(folio)); - btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE); + btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), folio_size(folio)); } /* @@ -515,47 +503,23 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; struct folio_iter fi; - const u32 sectorsize = fs_info->sectorsize; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_folio_all(fi, &bbio->bio) { bool uptodate = !bio->bi_status; struct folio *folio = fi.folio; struct inode *inode = folio->mapping->host; - u64 start; - u64 end; - u32 len; + u64 start = folio_pos(folio) + fi.offset; - /* For now only order 0 folios are supported for data. */ - ASSERT(folio_order(folio) == 0); btrfs_debug(fs_info, "%s: bi_sector=%llu, err=%d, mirror=%u", __func__, bio->bi_iter.bi_sector, bio->bi_status, bbio->mirror_num); - /* - * We always issue full-sector reads, but if some block in a - * folio fails to read, blk_update_request() will advance - * bv_offset and adjust bv_len to compensate. Print a warning - * for unaligned offsets, and an error if they don't add up to - * a full sector. - */ - if (!IS_ALIGNED(fi.offset, sectorsize)) - btrfs_err(fs_info, - "partial page read in btrfs with offset %zu and length %zu", - fi.offset, fi.length); - else if (!IS_ALIGNED(fi.offset + fi.length, sectorsize)) - btrfs_info(fs_info, - "incomplete page read with offset %zu and length %zu", - fi.offset, fi.length); - - start = folio_pos(folio) + fi.offset; - end = start + fi.length - 1; - len = fi.length; if (likely(uptodate)) { + u64 end = start + fi.length - 1; loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> folio_shift(folio); /* * Zero out the remaining part if this range straddles @@ -564,9 +528,11 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) * Here we should only zero the range inside the folio, * not touch anything else. * - * NOTE: i_size is exclusive while end is inclusive. + * NOTE: i_size is exclusive while end is inclusive and + * folio_contains() takes PAGE_SIZE units. */ - if (folio_index(folio) == end_index && i_size <= end) { + if (folio_contains(folio, i_size >> PAGE_SHIFT) && + i_size <= end) { u32 zero_start = max(offset_in_folio(folio, i_size), offset_in_folio(folio, start)); u32 zero_len = offset_in_folio(folio, end) + 1 - @@ -577,7 +543,7 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) } /* Update page status and unlock. */ - end_folio_read(folio, uptodate, start, len); + end_folio_read(folio, uptodate, start, fi.length); } bio_put(bio); } @@ -632,7 +598,7 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, for (allocated = 0; allocated < nr_pages;) { unsigned int last = allocated; - allocated = alloc_pages_bulk_array(gfp, nr_pages, page_array); + allocated = alloc_pages_bulk(gfp, nr_pages, page_array); if (unlikely(allocated == last)) { /* No progress, fail and do cleanup. */ for (int i = 0; i < allocated; i++) { @@ -668,13 +634,10 @@ static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail) } static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, - struct folio *folio, u64 disk_bytenr, - unsigned int pg_offset) + u64 disk_bytenr, loff_t file_offset) { struct bio *bio = &bio_ctrl->bbio->bio; - struct bio_vec *bvec = bio_last_bvec_all(bio); const sector_t sector = disk_bytenr >> SECTOR_SHIFT; - struct folio *bv_folio = page_folio(bvec->bv_page); if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { /* @@ -685,19 +648,11 @@ static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, } /* - * The contig check requires the following conditions to be met: - * - * 1) The folios are belonging to the same inode - * This is implied by the call chain. - * - * 2) The range has adjacent logical bytenr - * - * 3) The range has adjacent file offset - * This is required for the usage of btrfs_bio->file_offset. + * To merge into a bio both the disk sector and the logical offset in + * the file need to be contiguous. */ - return bio_end_sector(bio) == sector && - folio_pos(bv_folio) + bvec->bv_offset + bvec->bv_len == - folio_pos(folio) + pg_offset; + return bio_ctrl->next_file_offset == file_offset && + bio_end_sector(bio) == sector; } static void alloc_new_bio(struct btrfs_inode *inode, @@ -710,10 +665,12 @@ static void alloc_new_bio(struct btrfs_inode *inode, bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info, bio_ctrl->end_io_func, NULL); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + bbio->bio.bi_write_hint = inode->vfs_inode.i_write_hint; bbio->inode = inode; bbio->file_offset = file_offset; bio_ctrl->bbio = bbio; bio_ctrl->len_to_oe_boundary = U32_MAX; + bio_ctrl->next_file_offset = file_offset; /* Limit data write bios to the ordered boundary. */ if (bio_ctrl->wbc) { @@ -755,22 +712,21 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, size_t size, unsigned long pg_offset) { struct btrfs_inode *inode = folio_to_inode(folio); + loff_t file_offset = folio_pos(folio) + pg_offset; - ASSERT(pg_offset + size <= PAGE_SIZE); + ASSERT(pg_offset + size <= folio_size(folio)); ASSERT(bio_ctrl->end_io_func); if (bio_ctrl->bbio && - !btrfs_bio_is_contig(bio_ctrl, folio, disk_bytenr, pg_offset)) + !btrfs_bio_is_contig(bio_ctrl, disk_bytenr, file_offset)) submit_one_bio(bio_ctrl); do { u32 len = size; /* Allocate new bio if needed */ - if (!bio_ctrl->bbio) { - alloc_new_bio(inode, bio_ctrl, disk_bytenr, - folio_pos(folio) + pg_offset); - } + if (!bio_ctrl->bbio) + alloc_new_bio(inode, bio_ctrl, disk_bytenr, file_offset); /* Cap to the current ordered extent boundary if there is one. */ if (len > bio_ctrl->len_to_oe_boundary) { @@ -784,14 +740,15 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, submit_one_bio(bio_ctrl); continue; } + bio_ctrl->next_file_offset += len; if (bio_ctrl->wbc) - wbc_account_cgroup_owner(bio_ctrl->wbc, folio, - len); + wbc_account_cgroup_owner(bio_ctrl->wbc, folio, len); size -= len; pg_offset += len; disk_bytenr += len; + file_offset += len; /* * len_to_oe_boundary defaults to U32_MAX, which isn't folio or @@ -825,7 +782,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, static int attach_extent_buffer_folio(struct extent_buffer *eb, struct folio *folio, - struct btrfs_subpage *prealloc) + struct btrfs_folio_state *prealloc) { struct btrfs_fs_info *fs_info = eb->fs_info; int ret = 0; @@ -839,7 +796,7 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb, if (folio->mapping) lockdep_assert_held(&folio->mapping->i_private_lock); - if (fs_info->nodesize >= PAGE_SIZE) { + if (!btrfs_meta_is_subpage(fs_info)) { if (!folio_test_private(folio)) folio_attach_private(folio, eb); else @@ -849,7 +806,7 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb, /* Already mapped, just free prealloc */ if (folio_test_private(folio)) { - btrfs_free_subpage(prealloc); + btrfs_free_folio_state(prealloc); return 0; } @@ -858,15 +815,10 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb, folio_attach_private(folio, prealloc); else /* Do new allocation to attach subpage */ - ret = btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA); + ret = btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA); return ret; } -int set_page_extent_mapped(struct page *page) -{ - return set_folio_extent_mapped(page_folio(page)); -} - int set_folio_extent_mapped(struct folio *folio) { struct btrfs_fs_info *fs_info; @@ -878,8 +830,8 @@ int set_folio_extent_mapped(struct folio *folio) fs_info = folio_to_fs_info(folio); - if (btrfs_is_subpage(fs_info, folio->mapping)) - return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA); + if (btrfs_is_subpage(fs_info, folio)) + return btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA); folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE); return 0; @@ -895,44 +847,60 @@ void clear_folio_extent_mapped(struct folio *folio) return; fs_info = folio_to_fs_info(folio); - if (btrfs_is_subpage(fs_info, folio->mapping)) - return btrfs_detach_subpage(fs_info, folio); + if (btrfs_is_subpage(fs_info, folio)) + return btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA); folio_detach_private(folio); } -static struct extent_map *__get_extent_map(struct inode *inode, - struct folio *folio, u64 start, - u64 len, struct extent_map **em_cached) +static struct extent_map *get_extent_map(struct btrfs_inode *inode, + struct folio *folio, u64 start, + u64 len, struct extent_map **em_cached) { struct extent_map *em; - struct extent_state *cached_state = NULL; ASSERT(em_cached); if (*em_cached) { em = *em_cached; - if (extent_map_in_tree(em) && start >= em->start && - start < extent_map_end(em)) { + if (btrfs_extent_map_in_tree(em) && start >= em->start && + start < btrfs_extent_map_end(em)) { refcount_inc(&em->refs); return em; } - free_extent_map(em); + btrfs_free_extent_map(em); *em_cached = NULL; } - btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), start, start + len - 1, &cached_state); - em = btrfs_get_extent(BTRFS_I(inode), folio, start, len); + em = btrfs_get_extent(inode, folio, start, len); if (!IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); *em_cached = em; } - unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state); return em; } + +static void btrfs_readahead_expand(struct readahead_control *ractl, + const struct extent_map *em) +{ + const u64 ra_pos = readahead_pos(ractl); + const u64 ra_end = ra_pos + readahead_length(ractl); + const u64 em_end = em->start + em->ram_bytes; + + /* No expansion for holes and inline extents. */ + if (em->disk_bytenr > EXTENT_MAP_LAST_BYTE) + return; + + ASSERT(em_end >= ra_pos, + "extent_map %llu %llu ends before current readahead position %llu", + em->start, em->len, ra_pos); + if (em_end > ra_end) + readahead_expand(ractl, ra_pos, em_end - ra_pos); +} + /* * basic readpage implementation. Locked extent state structs are inserted * into the tree that are removed when the IO is done (by the end_io @@ -946,16 +914,12 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); u64 start = folio_pos(folio); - const u64 end = start + PAGE_SIZE - 1; - u64 cur = start; + const u64 end = start + folio_size(folio) - 1; u64 extent_offset; u64 last_byte = i_size_read(inode); - u64 block_start; struct extent_map *em; int ret = 0; - size_t pg_offset = 0; - size_t iosize; - size_t blocksize = fs_info->sectorsize; + const size_t blocksize = fs_info->sectorsize; ret = set_folio_extent_mapped(folio); if (ret < 0) { @@ -963,49 +927,62 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, return ret; } - if (folio->index == last_byte >> folio_shift(folio)) { + if (folio_contains(folio, last_byte >> PAGE_SHIFT)) { size_t zero_offset = offset_in_folio(folio, last_byte); - if (zero_offset) { - iosize = folio_size(folio) - zero_offset; - folio_zero_range(folio, zero_offset, iosize); - } + if (zero_offset) + folio_zero_range(folio, zero_offset, + folio_size(folio) - zero_offset); } bio_ctrl->end_io_func = end_bbio_data_read; begin_folio_read(fs_info, folio); - while (cur <= end) { + for (u64 cur = start; cur <= end; cur += blocksize) { enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; + unsigned long pg_offset = offset_in_folio(folio, cur); bool force_bio_submit = false; u64 disk_bytenr; + u64 block_start; ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { - iosize = folio_size(folio) - pg_offset; - folio_zero_range(folio, pg_offset, iosize); - end_folio_read(folio, true, cur, iosize); + folio_zero_range(folio, pg_offset, end - cur + 1); + end_folio_read(folio, true, cur, end - cur + 1); break; } - em = __get_extent_map(inode, folio, cur, end - cur + 1, - em_cached); + if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { + end_folio_read(folio, true, cur, blocksize); + continue; + } + em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached); if (IS_ERR(em)) { end_folio_read(folio, false, cur, end + 1 - cur); return PTR_ERR(em); } extent_offset = cur - em->start; - BUG_ON(extent_map_end(em) <= cur); + BUG_ON(btrfs_extent_map_end(em) <= cur); BUG_ON(end < cur); - compress_type = extent_map_compression(em); + compress_type = btrfs_extent_map_compression(em); + + /* + * Only expand readahead for extents which are already creating + * the pages anyway in add_ra_bio_pages, which is compressed + * extents in the non subpage case. + */ + if (bio_ctrl->ractl && + !btrfs_is_subpage(fs_info, folio) && + compress_type != BTRFS_COMPRESS_NONE) + btrfs_readahead_expand(bio_ctrl->ractl, em); - iosize = min(extent_map_end(em) - cur, end - cur + 1); - iosize = ALIGN(iosize, blocksize); if (compress_type != BTRFS_COMPRESS_NONE) disk_bytenr = em->disk_bytenr; else - disk_bytenr = extent_map_block_start(em) + extent_offset; - block_start = extent_map_block_start(em); + disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset; + if (em->flags & EXTENT_FLAG_PREALLOC) block_start = EXTENT_MAP_HOLE; + else + block_start = btrfs_extent_map_block_start(em); /* * If we have a file range that points to a compressed extent @@ -1049,23 +1026,18 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (prev_em_start) *prev_em_start = em->start; - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { - folio_zero_range(folio, pg_offset, iosize); - - end_folio_read(folio, true, cur, iosize); - cur = cur + iosize; - pg_offset += iosize; + folio_zero_range(folio, pg_offset, blocksize); + end_folio_read(folio, true, cur, blocksize); continue; } /* the get_extent function already copied into the folio */ if (block_start == EXTENT_MAP_INLINE) { - end_folio_read(folio, true, cur, iosize); - cur = cur + iosize; - pg_offset += iosize; + end_folio_read(folio, true, cur, blocksize); continue; } @@ -1076,23 +1048,205 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (force_bio_submit) submit_one_bio(bio_ctrl); - submit_extent_folio(bio_ctrl, disk_bytenr, folio, iosize, + submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize, pg_offset); - cur = cur + iosize; - pg_offset += iosize; } - return 0; } +/* + * Check if we can skip waiting the @ordered extent covering the block at @fileoff. + * + * @fileoff: Both input and output. + * Input as the file offset where the check should start at. + * Output as where the next check should start at, + * if the function returns true. + * + * Return true if we can skip to @fileoff. The caller needs to check the new + * @fileoff value to make sure it covers the full range, before skipping the + * full OE. + * + * Return false if we must wait for the ordered extent. + */ +static bool can_skip_one_ordered_range(struct btrfs_inode *inode, + struct btrfs_ordered_extent *ordered, + u64 *fileoff) +{ + const struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct folio *folio; + const u32 blocksize = fs_info->sectorsize; + u64 cur = *fileoff; + bool ret; + + folio = filemap_get_folio(inode->vfs_inode.i_mapping, cur >> PAGE_SHIFT); + + /* + * We should have locked the folio(s) for range [start, end], thus + * there must be a folio and it must be locked. + */ + ASSERT(!IS_ERR(folio)); + ASSERT(folio_test_locked(folio)); + + /* + * There are several cases for the folio and OE combination: + * + * 1) Folio has no private flag + * The OE has all its IO done but not yet finished, and folio got + * invalidated. + * + * Have we have to wait for the OE to finish, as it may contain the + * to-be-inserted data checksum. + * Without the data checksum inserted into the csum tree, read will + * just fail with missing csum. + */ + if (!folio_test_private(folio)) { + ret = false; + goto out; + } + + /* + * 2) The first block is DIRTY. + * + * This means the OE is created by some other folios whose file pos is + * before this one. And since we are holding the folio lock, the writeback + * of this folio cannot start. + * + * We must skip the whole OE, because it will never start until we + * finished our folio read and unlocked the folio. + */ + if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) { + u64 range_len = min(folio_end(folio), + ordered->file_offset + ordered->num_bytes) - cur; + + ret = true; + /* + * At least inside the folio, all the remaining blocks should + * also be dirty. + */ + ASSERT(btrfs_folio_test_dirty(fs_info, folio, cur, range_len)); + *fileoff = ordered->file_offset + ordered->num_bytes; + goto out; + } + + /* + * 3) The first block is uptodate. + * + * At least the first block can be skipped, but we are still not fully + * sure. E.g. if the OE has some other folios in the range that cannot + * be skipped. + * So we return true and update @next_ret to the OE/folio boundary. + */ + if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { + u64 range_len = min(folio_end(folio), + ordered->file_offset + ordered->num_bytes) - cur; + + /* + * The whole range to the OE end or folio boundary should also + * be uptodate. + */ + ASSERT(btrfs_folio_test_uptodate(fs_info, folio, cur, range_len)); + ret = true; + *fileoff = cur + range_len; + goto out; + } + + /* + * 4) The first block is not uptodate. + * + * This means the folio is invalidated after the writeback was finished, + * but by some other operations (e.g. block aligned buffered write) the + * folio is inserted into filemap. + * Very much the same as case 1). + */ + ret = false; +out: + folio_put(folio); + return ret; +} + +static bool can_skip_ordered_extent(struct btrfs_inode *inode, + struct btrfs_ordered_extent *ordered, + u64 start, u64 end) +{ + const u64 range_end = min(end, ordered->file_offset + ordered->num_bytes - 1); + u64 cur = max(start, ordered->file_offset); + + while (cur < range_end) { + bool can_skip; + + can_skip = can_skip_one_ordered_range(inode, ordered, &cur); + if (!can_skip) + return false; + } + return true; +} + +/* + * Locking helper to make sure we get a stable view of extent maps for the + * involved range. + * + * This is for folio read paths (read and readahead), thus the involved range + * should have all the folios locked. + */ +static void lock_extents_for_read(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state) +{ + u64 cur_pos; + + /* Caller must provide a valid @cached_state. */ + ASSERT(cached_state); + + /* The range must at least be page aligned, as all read paths are folio based. */ + ASSERT(IS_ALIGNED(start, PAGE_SIZE)); + ASSERT(IS_ALIGNED(end + 1, PAGE_SIZE)); + +again: + btrfs_lock_extent(&inode->io_tree, start, end, cached_state); + cur_pos = start; + while (cur_pos < end) { + struct btrfs_ordered_extent *ordered; + + ordered = btrfs_lookup_ordered_range(inode, cur_pos, + end - cur_pos + 1); + /* + * No ordered extents in the range, and we hold the extent lock, + * no one can modify the extent maps in the range, we're safe to return. + */ + if (!ordered) + break; + + /* Check if we can skip waiting for the whole OE. */ + if (can_skip_ordered_extent(inode, ordered, start, end)) { + cur_pos = min(ordered->file_offset + ordered->num_bytes, + end + 1); + btrfs_put_ordered_extent(ordered); + continue; + } + + /* Now wait for the OE to finish. */ + btrfs_unlock_extent(&inode->io_tree, start, end, cached_state); + btrfs_start_ordered_extent_nowriteback(ordered, start, end + 1 - start); + btrfs_put_ordered_extent(ordered); + /* We have unlocked the whole range, restart from the beginning. */ + goto again; + } +} + int btrfs_read_folio(struct file *file, struct folio *folio) { + struct btrfs_inode *inode = folio_to_inode(folio); + const u64 start = folio_pos(folio); + const u64 end = start + folio_size(folio) - 1; + struct extent_state *cached_state = NULL; struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; struct extent_map *em_cached = NULL; int ret; + lock_extents_for_read(inode, start, end, &cached_state); ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); - free_extent_map(em_cached); + btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); + + btrfs_free_extent_map(em_cached); /* * If btrfs_do_readpage() failed we will want to submit the assembled @@ -1110,7 +1264,7 @@ static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bit unsigned int start_bit; unsigned int nbits; - ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE); + ASSERT(start >= folio_start && start + len <= folio_start + folio_size(folio)); start_bit = (start - folio_start) >> fs_info->sectorsize_bits; nbits = len >> fs_info->sectorsize_bits; ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits)); @@ -1123,12 +1277,12 @@ static bool find_next_delalloc_bitmap(struct folio *folio, { struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); const u64 folio_start = folio_pos(folio); - const unsigned int bitmap_size = fs_info->sectors_per_page; + const unsigned int bitmap_size = btrfs_blocks_per_folio(fs_info, folio); unsigned int start_bit; unsigned int first_zero; unsigned int first_set; - ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE); + ASSERT(start >= folio_start && start < folio_start + folio_size(folio)); start_bit = (start - folio_start) >> fs_info->sectorsize_bits; first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit); @@ -1142,14 +1296,19 @@ static bool find_next_delalloc_bitmap(struct folio *folio, } /* - * helper for extent_writepage(), doing all of the delayed allocation setup. + * Do all of the delayed allocation setup. * - * This returns 1 if btrfs_run_delalloc_range function did all the work required - * to write the page (copy into inline extent). In this case the IO has - * been started and the page is already unlocked. + * Return >0 if all the dirty blocks are submitted async (compression) or inlined. + * The @folio should no longer be touched (treat it as already unlocked). * - * This returns 0 if all went well (page still locked) - * This returns < 0 if there were errors (page still locked) + * Return 0 if there is still dirty block that needs to be submitted through + * extent_writepage_io(). + * bio_ctrl->submit_bitmap will indicate which blocks of the folio should be + * submitted, and @folio is still kept locked. + * + * Return <0 if there is any error hit. + * Any allocated ordered extent range covering this folio will be marked + * finished (IOERR), and @folio is still kept locked. */ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, struct folio *folio, @@ -1157,9 +1316,10 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, { struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode); struct writeback_control *wbc = bio_ctrl->wbc; - const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping); + const bool is_subpage = btrfs_is_subpage(fs_info, folio); const u64 page_start = folio_pos(folio); const u64 page_end = page_start + folio_size(folio) - 1; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); unsigned long delalloc_bitmap = 0; /* * Save the last found delalloc end. As the delalloc end can go beyond @@ -1167,6 +1327,16 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, * last delalloc end. */ u64 last_delalloc_end = 0; + /* + * The range end (exclusive) of the last successfully finished delalloc + * range. + * Any range covered by ordered extent must either be manually marked + * finished (error handling), or has IO submitted (and finish the + * ordered extent normally). + * + * This records the end of ordered extent cleanup if we hit an error. + */ + u64 last_finished_delalloc_end = page_start; u64 delalloc_start = page_start; u64 delalloc_end = page_end; u64 delalloc_to_write = 0; @@ -1174,14 +1344,14 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, int bit; /* Save the dirty bitmap as our submission bitmap will be a subset of it. */ - if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { - ASSERT(fs_info->sectors_per_page > 1); + if (btrfs_is_subpage(fs_info, folio)) { + ASSERT(blocks_per_folio > 1); btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap); } else { bio_ctrl->submit_bitmap = 1; } - for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) { u64 start = page_start + (bit << fs_info->sectorsize_bits); btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize); @@ -1235,19 +1405,36 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, found_len = last_delalloc_end + 1 - found_start; if (ret >= 0) { + /* + * Some delalloc range may be created by previous folios. + * Thus we still need to clean up this range during error + * handling. + */ + last_finished_delalloc_end = found_start; /* No errors hit so far, run the current delalloc range. */ ret = btrfs_run_delalloc_range(inode, folio, found_start, found_start + found_len - 1, wbc); + if (ret >= 0) + last_finished_delalloc_end = found_start + found_len; + if (unlikely(ret < 0)) + btrfs_err_rl(fs_info, +"failed to run delalloc range, root=%lld ino=%llu folio=%llu submit_bitmap=%*pbl start=%llu len=%u: %d", + btrfs_root_id(inode->root), + btrfs_ino(inode), + folio_pos(folio), + blocks_per_folio, + &bio_ctrl->submit_bitmap, + found_start, found_len, ret); } else { /* * We've hit an error during previous delalloc range, * have to cleanup the remaining locked ranges. */ - unlock_extent(&inode->io_tree, found_start, - found_start + found_len - 1, NULL); - __unlock_for_delalloc(&inode->vfs_inode, folio, + btrfs_unlock_extent(&inode->io_tree, found_start, + found_start + found_len - 1, NULL); + unlock_delalloc_folio(&inode->vfs_inode, folio, found_start, found_start + found_len - 1); } @@ -1274,8 +1461,22 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, delalloc_start = found_start + found_len; } - if (ret < 0) + /* + * It's possible we had some ordered extents created before we hit + * an error, cleanup non-async successfully created delalloc ranges. + */ + if (unlikely(ret < 0)) { + unsigned int bitmap_size = min( + (last_finished_delalloc_end - page_start) >> + fs_info->sectorsize_bits, + blocks_per_folio); + + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size) + btrfs_mark_ordered_io_finished(inode, folio, + page_start + (bit << fs_info->sectorsize_bits), + fs_info->sectorsize, false); return ret; + } out: if (last_delalloc_end) delalloc_end = last_delalloc_end; @@ -1283,7 +1484,7 @@ out: delalloc_end = page_end; /* * delalloc_end is already one less than the total length, so - * we don't subtract one from PAGE_SIZE + * we don't subtract one from PAGE_SIZE. */ delalloc_to_write += DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE); @@ -1292,7 +1493,7 @@ out: * If all ranges are submitted asynchronously, we just need to account * for them here. */ - if (bitmap_empty(&bio_ctrl->submit_bitmap, fs_info->sectors_per_page)) { + if (bitmap_empty(&bio_ctrl->submit_bitmap, blocks_per_folio)) { wbc->nr_to_write -= delalloc_to_write; return 1; } @@ -1335,22 +1536,22 @@ static int submit_one_sector(struct btrfs_inode *inode, em = btrfs_get_extent(inode, NULL, filepos, sectorsize); if (IS_ERR(em)) - return PTR_ERR_OR_ZERO(em); + return PTR_ERR(em); extent_offset = filepos - em->start; - em_end = extent_map_end(em); + em_end = btrfs_extent_map_end(em); ASSERT(filepos <= em_end); ASSERT(IS_ALIGNED(em->start, sectorsize)); ASSERT(IS_ALIGNED(em->len, sectorsize)); - block_start = extent_map_block_start(em); - disk_bytenr = extent_map_block_start(em) + extent_offset; + block_start = btrfs_extent_map_block_start(em); + disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset; - ASSERT(!extent_map_is_compressed(em)); + ASSERT(!btrfs_extent_map_is_compressed(em)); ASSERT(block_start != EXTENT_MAP_HOLE); ASSERT(block_start != EXTENT_MAP_INLINE); - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; /* @@ -1391,7 +1592,9 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long range_bitmap = 0; bool submitted_io = false; + bool error = false; const u64 folio_start = folio_pos(folio); + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); u64 cur; int bit; int ret = 0; @@ -1400,21 +1603,23 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, start + len <= folio_start + folio_size(folio)); ret = btrfs_writepage_cow_fixup(folio); - if (ret) { + if (ret == -EAGAIN) { /* Fixup worker will requeue */ folio_redirty_for_writepage(bio_ctrl->wbc, folio); folio_unlock(folio); return 1; } + if (ret < 0) + return ret; for (cur = start; cur < start + len; cur += fs_info->sectorsize) set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap, - fs_info->sectors_per_page); + blocks_per_folio); bio_ctrl->end_io_func = end_bbio_data_write; - for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) { cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits); if (cur >= i_size) { @@ -1433,11 +1638,26 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, break; } ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); - if (ret < 0) - goto out; + if (unlikely(ret < 0)) { + /* + * bio_ctrl may contain a bio crossing several folios. + * Submit it immediately so that the bio has a chance + * to finish normally, other than marked as error. + */ + submit_one_bio(bio_ctrl); + /* + * Failed to grab the extent map which should be very rare. + * Since there is no bio submitted to finish the ordered + * extent, we have to manually finish this sector. + */ + btrfs_mark_ordered_io_finished(inode, folio, cur, + fs_info->sectorsize, false); + error = true; + continue; + } submitted_io = true; } -out: + /* * If we didn't submitted any sector (>= i_size), folio dirty get * cleared but PAGECACHE_TAG_DIRTY is not cleared (only cleared @@ -1445,8 +1665,11 @@ out: * * Here we set writeback and clear for the range. If the full folio * is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag. + * + * If we hit any error, the corresponding sector will still be dirty + * thus no need to clear PAGECACHE_TAG_DIRTY. */ - if (!submitted_io) { + if (!submitted_io && !error) { btrfs_folio_set_writeback(fs_info, folio, start, len); btrfs_folio_clear_writeback(fs_info, folio, start, len); } @@ -1464,15 +1687,15 @@ out: */ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl) { - struct inode *inode = folio->mapping->host; - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - const u64 page_start = folio_pos(folio); + struct btrfs_inode *inode = BTRFS_I(folio->mapping->host); + struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret; size_t pg_offset; - loff_t i_size = i_size_read(inode); - unsigned long end_index = i_size >> PAGE_SHIFT; + loff_t i_size = i_size_read(&inode->vfs_inode); + const pgoff_t end_index = i_size >> PAGE_SHIFT; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); - trace_extent_writepage(folio, inode, bio_ctrl->wbc); + trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc); WARN_ON(!folio_test_locked(folio)); @@ -1484,7 +1707,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl return 0; } - if (folio->index == end_index) + if (folio_contains(folio, end_index)) folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset); /* @@ -1492,30 +1715,56 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl * The proper bitmap can only be initialized until writepage_delalloc(). */ bio_ctrl->submit_bitmap = (unsigned long)-1; + + /* + * If the page is dirty but without private set, it's marked dirty + * without informing the fs. + * Nowadays that is a bug, since the introduction of + * pin_user_pages*(). + * + * So here we check if the page has private set to rule out such + * case. + * But we also have a long history of relying on the COW fixup, + * so here we only enable this check for experimental builds until + * we're sure it's safe. + */ + if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && + unlikely(!folio_test_private(folio))) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + btrfs_err_rl(fs_info, + "root %lld ino %llu folio %llu is marked dirty without notifying the fs", + btrfs_root_id(inode->root), + btrfs_ino(inode), folio_pos(folio)); + ret = -EUCLEAN; + goto done; + } + ret = set_folio_extent_mapped(folio); if (ret < 0) goto done; - ret = writepage_delalloc(BTRFS_I(inode), folio, bio_ctrl); + ret = writepage_delalloc(inode, folio, bio_ctrl); if (ret == 1) return 0; if (ret) goto done; - ret = extent_writepage_io(BTRFS_I(inode), folio, folio_pos(folio), - PAGE_SIZE, bio_ctrl, i_size); + ret = extent_writepage_io(inode, folio, folio_pos(folio), + folio_size(folio), bio_ctrl, i_size); if (ret == 1) return 0; + if (ret < 0) + btrfs_err_rl(fs_info, +"failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d", + btrfs_root_id(inode->root), btrfs_ino(inode), + folio_pos(folio), blocks_per_folio, + &bio_ctrl->submit_bitmap, ret); bio_ctrl->wbc->nr_to_write--; done: - if (ret) { - btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio, - page_start, PAGE_SIZE, !ret); + if (ret < 0) mapping_set_error(folio->mapping, ret); - } - /* * Only unlock ranges that are submitted. As there can be some async * submitted ranges inside the folio. @@ -1525,12 +1774,6 @@ done: return ret; } -void wait_on_extent_buffer_writeback(struct extent_buffer *eb) -{ - wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, - TASK_UNINTERRUPTIBLE); -} - /* * Lock extent buffer status and pages for writeback. * @@ -1560,8 +1803,18 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e */ spin_lock(&eb->refs_lock); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); + unsigned long flags; + set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); spin_unlock(&eb->refs_lock); + + xas_lock_irqsave(&xas, flags); + xas_load(&xas); + xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); + xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); + xas_unlock_irqrestore(&xas, flags); + btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, @@ -1647,50 +1900,167 @@ static void set_btree_ioerr(struct extent_buffer *eb) } } +static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); + unsigned long flags; + + xas_lock_irqsave(&xas, flags); + xas_load(&xas); + xas_set_mark(&xas, mark); + xas_unlock_irqrestore(&xas, flags); +} + +static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mark) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); + unsigned long flags; + + xas_lock_irqsave(&xas, flags); + xas_load(&xas); + xas_clear_mark(&xas, mark); + xas_unlock_irqrestore(&xas, flags); +} + +static void buffer_tree_tag_for_writeback(struct btrfs_fs_info *fs_info, + unsigned long start, unsigned long end) +{ + XA_STATE(xas, &fs_info->buffer_tree, start); + unsigned int tagged = 0; + void *eb; + + xas_lock_irq(&xas); + xas_for_each_marked(&xas, eb, end, PAGECACHE_TAG_DIRTY) { + xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); + if (++tagged % XA_CHECK_SCHED) + continue; + xas_pause(&xas); + xas_unlock_irq(&xas); + cond_resched(); + xas_lock_irq(&xas); + } + xas_unlock_irq(&xas); +} + +struct eb_batch { + unsigned int nr; + unsigned int cur; + struct extent_buffer *ebs[PAGEVEC_SIZE]; +}; + +static inline bool eb_batch_add(struct eb_batch *batch, struct extent_buffer *eb) +{ + batch->ebs[batch->nr++] = eb; + return (batch->nr < PAGEVEC_SIZE); +} + +static inline void eb_batch_init(struct eb_batch *batch) +{ + batch->nr = 0; + batch->cur = 0; +} + +static inline struct extent_buffer *eb_batch_next(struct eb_batch *batch) +{ + if (batch->cur >= batch->nr) + return NULL; + return batch->ebs[batch->cur++]; +} + +static inline void eb_batch_release(struct eb_batch *batch) +{ + for (unsigned int i = 0; i < batch->nr; i++) + free_extent_buffer(batch->ebs[i]); + eb_batch_init(batch); +} + +static inline struct extent_buffer *find_get_eb(struct xa_state *xas, unsigned long max, + xa_mark_t mark) +{ + struct extent_buffer *eb; + +retry: + eb = xas_find_marked(xas, max, mark); + + if (xas_retry(xas, eb)) + goto retry; + + if (!eb) + return NULL; + + if (!refcount_inc_not_zero(&eb->refs)) { + xas_reset(xas); + goto retry; + } + + if (unlikely(eb != xas_reload(xas))) { + free_extent_buffer(eb); + xas_reset(xas); + goto retry; + } + + return eb; +} + +static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info, + unsigned long *start, + unsigned long end, xa_mark_t tag, + struct eb_batch *batch) +{ + XA_STATE(xas, &fs_info->buffer_tree, *start); + struct extent_buffer *eb; + + rcu_read_lock(); + while ((eb = find_get_eb(&xas, end, tag)) != NULL) { + if (!eb_batch_add(batch, eb)) { + *start = ((eb->start + eb->len) >> fs_info->nodesize_bits); + goto out; + } + } + if (end == ULONG_MAX) + *start = ULONG_MAX; + else + *start = end + 1; +out: + rcu_read_unlock(); + + return batch->nr; +} + /* * The endio specific version which won't touch any unsafe spinlock in endio * context. */ static struct extent_buffer *find_extent_buffer_nolock( - const struct btrfs_fs_info *fs_info, u64 start) + struct btrfs_fs_info *fs_info, u64 start) { struct extent_buffer *eb; + unsigned long index = (start >> fs_info->nodesize_bits); rcu_read_lock(); - eb = radix_tree_lookup(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits); - if (eb && atomic_inc_not_zero(&eb->refs)) { - rcu_read_unlock(); - return eb; - } + eb = xa_load(&fs_info->buffer_tree, index); + if (eb && !refcount_inc_not_zero(&eb->refs)) + eb = NULL; rcu_read_unlock(); - return NULL; + return eb; } static void end_bbio_meta_write(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; - struct btrfs_fs_info *fs_info = eb->fs_info; - bool uptodate = !bbio->bio.bi_status; struct folio_iter fi; - u32 bio_offset = 0; - if (!uptodate) + if (bbio->bio.bi_status != BLK_STS_OK) set_btree_ioerr(eb); bio_for_each_folio_all(fi, &bbio->bio) { - u64 start = eb->start + bio_offset; - struct folio *folio = fi.folio; - u32 len = fi.length; - - btrfs_folio_clear_writeback(fs_info, folio, start, len); - bio_offset += len; + btrfs_meta_folio_clear_writeback(fi.folio, eb); } - clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); - smp_mb__after_atomic(); - wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); - + buffer_tree_clear_mark(eb, PAGECACHE_TAG_WRITEBACK); + clear_and_wake_up_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); bio_put(&bbio->bio); } @@ -1738,199 +2108,56 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, wbc_init_bio(wbc, &bbio->bio); bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; - if (fs_info->nodesize < PAGE_SIZE) { - struct folio *folio = eb->folios[0]; - bool ret; + for (int i = 0; i < num_extent_folios(eb); i++) { + struct folio *folio = eb->folios[i]; + u64 range_start = max_t(u64, eb->start, folio_pos(folio)); + u32 range_len = min_t(u64, folio_end(folio), + eb->start + eb->len) - range_start; folio_lock(folio); - btrfs_subpage_set_writeback(fs_info, folio, eb->start, eb->len); - if (btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, - eb->len)) { - folio_clear_dirty_for_io(folio); - wbc->nr_to_write--; - } - ret = bio_add_folio(&bbio->bio, folio, eb->len, - eb->start - folio_pos(folio)); - ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio, eb->len); - folio_unlock(folio); - } else { - int num_folios = num_extent_folios(eb); - - for (int i = 0; i < num_folios; i++) { - struct folio *folio = eb->folios[i]; - bool ret; - - folio_lock(folio); - folio_clear_dirty_for_io(folio); - folio_start_writeback(folio); - ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); - ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio, eb->folio_size); + btrfs_meta_folio_clear_dirty(folio, eb); + btrfs_meta_folio_set_writeback(folio, eb); + if (!folio_test_dirty(folio)) wbc->nr_to_write -= folio_nr_pages(folio); - folio_unlock(folio); - } + bio_add_folio_nofail(&bbio->bio, folio, range_len, + offset_in_folio(folio, range_start)); + wbc_account_cgroup_owner(wbc, folio, range_len); + folio_unlock(folio); } btrfs_submit_bbio(bbio, 0); } /* - * Submit one subpage btree page. - * - * The main difference to submit_eb_page() is: - * - Page locking - * For subpage, we don't rely on page locking at all. - * - * - Flush write bio - * We only flush bio if we may be unable to fit current extent buffers into - * current bio. + * Wait for all eb writeback in the given range to finish. * - * Return >=0 for the number of submitted extent buffers. - * Return <0 for fatal error. + * @fs_info: The fs_info for this file system. + * @start: The offset of the range to start waiting on writeback. + * @end: The end of the range, inclusive. This is meant to be used in + * conjuction with wait_marked_extents, so this will usually be + * the_next_eb->start - 1. */ -static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc) +void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start, + u64 end) { - struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); - int submitted = 0; - u64 folio_start = folio_pos(folio); - int bit_start = 0; - int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; - - /* Lock and write each dirty extent buffers in the range */ - while (bit_start < fs_info->sectors_per_page) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct eb_batch batch; + unsigned long start_index = (start >> fs_info->nodesize_bits); + unsigned long end_index = (end >> fs_info->nodesize_bits); + + eb_batch_init(&batch); + while (start_index <= end_index) { struct extent_buffer *eb; - unsigned long flags; - u64 start; + unsigned int nr_ebs; - /* - * Take private lock to ensure the subpage won't be detached - * in the meantime. - */ - spin_lock(&folio->mapping->i_private_lock); - if (!folio_test_private(folio)) { - spin_unlock(&folio->mapping->i_private_lock); + nr_ebs = buffer_tree_get_ebs_tag(fs_info, &start_index, end_index, + PAGECACHE_TAG_WRITEBACK, &batch); + if (!nr_ebs) break; - } - spin_lock_irqsave(&subpage->lock, flags); - if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * fs_info->sectors_per_page, - subpage->bitmaps)) { - spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&folio->mapping->i_private_lock); - bit_start++; - continue; - } - - start = folio_start + bit_start * fs_info->sectorsize; - bit_start += sectors_per_node; - /* - * Here we just want to grab the eb without touching extra - * spin locks, so call find_extent_buffer_nolock(). - */ - eb = find_extent_buffer_nolock(fs_info, start); - spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&folio->mapping->i_private_lock); - - /* - * The eb has already reached 0 refs thus find_extent_buffer() - * doesn't return it. We don't need to write back such eb - * anyway. - */ - if (!eb) - continue; - - if (lock_extent_buffer_for_io(eb, wbc)) { - write_one_eb(eb, wbc); - submitted++; - } - free_extent_buffer(eb); - } - return submitted; -} - -/* - * Submit all page(s) of one extent buffer. - * - * @page: the page of one extent buffer - * @eb_context: to determine if we need to submit this page, if current page - * belongs to this eb, we don't need to submit - * - * The caller should pass each page in their bytenr order, and here we use - * @eb_context to determine if we have submitted pages of one extent buffer. - * - * If we have, we just skip until we hit a new page that doesn't belong to - * current @eb_context. - * - * If not, we submit all the page(s) of the extent buffer. - * - * Return >0 if we have submitted the extent buffer successfully. - * Return 0 if we don't need to submit the page, as it's already submitted by - * previous call. - * Return <0 for fatal error. - */ -static int submit_eb_page(struct folio *folio, struct btrfs_eb_write_context *ctx) -{ - struct writeback_control *wbc = ctx->wbc; - struct address_space *mapping = folio->mapping; - struct extent_buffer *eb; - int ret; - - if (!folio_test_private(folio)) - return 0; - - if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE) - return submit_eb_subpage(folio, wbc); - - spin_lock(&mapping->i_private_lock); - if (!folio_test_private(folio)) { - spin_unlock(&mapping->i_private_lock); - return 0; - } - - eb = folio_get_private(folio); - - /* - * Shouldn't happen and normally this would be a BUG_ON but no point - * crashing the machine for something we can survive anyway. - */ - if (WARN_ON(!eb)) { - spin_unlock(&mapping->i_private_lock); - return 0; - } - - if (eb == ctx->eb) { - spin_unlock(&mapping->i_private_lock); - return 0; - } - ret = atomic_inc_not_zero(&eb->refs); - spin_unlock(&mapping->i_private_lock); - if (!ret) - return 0; - - ctx->eb = eb; - - ret = btrfs_check_meta_write_pointer(eb->fs_info, ctx); - if (ret) { - if (ret == -EBUSY) - ret = 0; - free_extent_buffer(eb); - return ret; - } - - if (!lock_extent_buffer_for_io(eb, wbc)) { - free_extent_buffer(eb); - return 0; - } - /* Implies write in zoned mode. */ - if (ctx->zoned_bg) { - /* Mark the last eb in the block group. */ - btrfs_schedule_zone_finish_bg(ctx->zoned_bg, eb); - ctx->zoned_bg->meta_write_pointer += eb->len; + while ((eb = eb_batch_next(&batch)) != NULL) + wait_on_extent_buffer_writeback(eb); + eb_batch_release(&batch); + cond_resched(); } - write_one_eb(eb, wbc); - free_extent_buffer(eb); - return 1; } int btree_write_cache_pages(struct address_space *mapping, @@ -1941,25 +2168,27 @@ int btree_write_cache_pages(struct address_space *mapping, int ret = 0; int done = 0; int nr_to_write_done = 0; - struct folio_batch fbatch; - unsigned int nr_folios; - pgoff_t index; - pgoff_t end; /* Inclusive */ + struct eb_batch batch; + unsigned int nr_ebs; + unsigned long index; + unsigned long end; int scanned = 0; xa_mark_t tag; - folio_batch_init(&fbatch); + eb_batch_init(&batch); if (wbc->range_cyclic) { - index = mapping->writeback_index; /* Start from prev offset */ + index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->nodesize_bits); end = -1; + /* * Start from the beginning does not need to cycle over the * range, mark it as scanned. */ scanned = (index == 0); } else { - index = wbc->range_start >> PAGE_SHIFT; - end = wbc->range_end >> PAGE_SHIFT; + index = (wbc->range_start >> fs_info->nodesize_bits); + end = (wbc->range_end >> fs_info->nodesize_bits); + scanned = 1; } if (wbc->sync_mode == WB_SYNC_ALL) @@ -1969,31 +2198,39 @@ int btree_write_cache_pages(struct address_space *mapping, btrfs_zoned_meta_io_lock(fs_info); retry: if (wbc->sync_mode == WB_SYNC_ALL) - tag_pages_for_writeback(mapping, index, end); + buffer_tree_tag_for_writeback(fs_info, index, end); while (!done && !nr_to_write_done && (index <= end) && - (nr_folios = filemap_get_folios_tag(mapping, &index, end, - tag, &fbatch))) { - unsigned i; + (nr_ebs = buffer_tree_get_ebs_tag(fs_info, &index, end, tag, &batch))) { + struct extent_buffer *eb; - for (i = 0; i < nr_folios; i++) { - struct folio *folio = fbatch.folios[i]; + while ((eb = eb_batch_next(&batch)) != NULL) { + ctx.eb = eb; + + ret = btrfs_check_meta_write_pointer(eb->fs_info, &ctx); + if (ret) { + if (ret == -EBUSY) + ret = 0; - ret = submit_eb_page(folio, &ctx); - if (ret == 0) + if (ret) { + done = 1; + break; + } continue; - if (ret < 0) { - done = 1; - break; } - /* - * the filesystem may choose to bump up nr_to_write. - * We have to make sure to honor the new nr_to_write - * at any time - */ - nr_to_write_done = wbc->nr_to_write <= 0; + if (!lock_extent_buffer_for_io(eb, wbc)) + continue; + + /* Implies write in zoned mode. */ + if (ctx.zoned_bg) { + /* Mark the last eb in the block group. */ + btrfs_schedule_zone_finish_bg(ctx.zoned_bg, eb); + ctx.zoned_bg->meta_write_pointer += eb->len; + } + write_one_eb(eb, wbc); } - folio_batch_release(&fbatch); + nr_to_write_done = (wbc->nr_to_write <= 0); + eb_batch_release(&batch); cond_resched(); } if (!scanned && !done) { @@ -2138,10 +2375,8 @@ retry: done_index = folio_next_index(folio); /* * At this point we hold neither the i_pages lock nor - * the page lock: the page may be truncated or - * invalidated (changing page->mapping to NULL), - * or even swizzled back from swapper_space to - * tmpfs file mapping + * the folio lock: the folio may be truncated or + * invalidated (changing folio->mapping to NULL). */ if (!folio_trylock(folio)) { submit_write_bio(bio_ctrl, 0); @@ -2179,7 +2414,7 @@ retry: * regular submission. */ if (wbc->sync_mode != WB_SYNC_NONE || - btrfs_is_subpage(inode_to_fs_info(inode), mapping)) { + btrfs_is_subpage(inode_to_fs_info(inode), folio)) { if (folio_test_writeback(folio)) submit_write_bio(bio_ctrl, 0); folio_wait_writeback(folio); @@ -2260,8 +2495,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); while (cur <= end) { - u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); - u32 cur_len = cur_end + 1 - cur; + u64 cur_end; + u32 cur_len; struct folio *folio; folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT); @@ -2271,13 +2506,18 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f * code is just in case, but shouldn't actually be run. */ if (IS_ERR(folio)) { + cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); + cur_len = cur_end + 1 - cur; btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, cur, cur_len, false); mapping_set_error(mapping, PTR_ERR(folio)); - cur = cur_end + 1; + cur = cur_end; continue; } + cur_end = min_t(u64, folio_end(folio) - 1, end); + cur_len = cur_end + 1 - cur; + ASSERT(folio_test_locked(folio)); if (pages_dirty && folio != locked_folio) ASSERT(folio_test_dirty(folio)); @@ -2292,11 +2532,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f if (ret == 1) goto next_page; - if (ret) { - btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio, - cur, cur_len, !ret); + if (ret) mapping_set_error(mapping, ret); - } btrfs_folio_end_lock(fs_info, folio, cur, cur_len); if (ret < 0) found_error = true; @@ -2330,16 +2567,27 @@ int btrfs_writepages(struct address_space *mapping, struct writeback_control *wb void btrfs_readahead(struct readahead_control *rac) { - struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD }; + struct btrfs_bio_ctrl bio_ctrl = { + .opf = REQ_OP_READ | REQ_RAHEAD, + .ractl = rac + }; struct folio *folio; + struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); + const u64 start = readahead_pos(rac); + const u64 end = start + readahead_length(rac) - 1; + struct extent_state *cached_state = NULL; struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; + lock_extents_for_read(inode, start, end, &cached_state); + while ((folio = readahead_folio(rac)) != NULL) btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); + btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); + if (em_cached) - free_extent_map(em_cached); + btrfs_free_extent_map(em_cached); submit_one_bio(&bio_ctrl); } @@ -2363,7 +2611,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree, if (start > end) return 0; - lock_extent(tree, start, end, &cached_state); + btrfs_lock_extent(tree, start, end, &cached_state); folio_wait_writeback(folio); /* @@ -2371,46 +2619,54 @@ int extent_invalidate_folio(struct extent_io_tree *tree, * so here we only need to unlock the extent range to free any * existing extent state. */ - unlock_extent(tree, start, end, &cached_state); + btrfs_unlock_extent(tree, start, end, &cached_state); return 0; } /* - * a helper for release_folio, this tests for areas of the page that - * are locked or under IO and drops the related state bits if it is safe - * to drop the page. + * A helper for struct address_space_operations::release_folio, this tests for + * areas of the folio that are locked or under IO and drops the related state + * bits if it is safe to drop the folio. */ static bool try_release_extent_state(struct extent_io_tree *tree, struct folio *folio) { + struct extent_state *cached_state = NULL; u64 start = folio_pos(folio); - u64 end = start + PAGE_SIZE - 1; - bool ret; + u64 end = start + folio_size(folio) - 1; + u32 range_bits; + u32 clear_bits; + bool ret = false; + int ret2; - if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) { - ret = false; - } else { - u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | - EXTENT_DELALLOC_NEW | EXTENT_CTLBITS | - EXTENT_QGROUP_RESERVED); - int ret2; + btrfs_get_range_bits(tree, start, end, &range_bits, &cached_state); - /* - * At this point we can safely clear everything except the - * locked bit, the nodatasum bit and the delalloc new bit. - * The delalloc new bit will be cleared by ordered extent - * completion. - */ - ret2 = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL); + /* + * We can release the folio if it's locked only for ordered extent + * completion, since that doesn't require using the folio. + */ + if ((range_bits & EXTENT_LOCKED) && + !(range_bits & EXTENT_FINISHING_ORDERED)) + goto out; + + clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW | + EXTENT_CTLBITS | EXTENT_QGROUP_RESERVED | + EXTENT_FINISHING_ORDERED); + /* + * At this point we can safely clear everything except the locked, + * nodatasum, delalloc new and finishing ordered bits. The delalloc new + * bit will be cleared by ordered extent completion. + */ + ret2 = btrfs_clear_extent_bit(tree, start, end, clear_bits, &cached_state); + /* + * If clear_extent_bit failed for enomem reasons, we can't allow the + * release to continue. + */ + if (ret2 == 0) + ret = true; +out: + btrfs_free_extent_state(cached_state); - /* if clear_extent_bit failed for enomem reasons, - * we can't allow the release to continue. - */ - if (ret2 < 0) - ret = false; - else - ret = true; - } return ret; } @@ -2422,7 +2678,7 @@ static bool try_release_extent_state(struct extent_io_tree *tree, bool try_release_extent_mapping(struct folio *folio, gfp_t mask) { u64 start = folio_pos(folio); - u64 end = start + PAGE_SIZE - 1; + u64 end = start + folio_size(folio) - 1; struct btrfs_inode *inode = folio_to_inode(folio); struct extent_io_tree *io_tree = &inode->io_tree; @@ -2433,18 +2689,19 @@ bool try_release_extent_mapping(struct folio *folio, gfp_t mask) struct extent_map *em; write_lock(&extent_tree->lock); - em = lookup_extent_mapping(extent_tree, start, len); + em = btrfs_lookup_extent_mapping(extent_tree, start, len); if (!em) { write_unlock(&extent_tree->lock); break; } if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) { write_unlock(&extent_tree->lock); - free_extent_map(em); + btrfs_free_extent_map(em); break; } - if (test_range_bit_exists(io_tree, em->start, - extent_map_end(em) - 1, EXTENT_LOCKED)) + if (btrfs_test_range_bit_exists(io_tree, em->start, + btrfs_extent_map_end(em) - 1, + EXTENT_LOCKED)) goto next; /* * If it's not in the list of modified extents, used by a fast @@ -2471,15 +2728,15 @@ remove_em: * fsync performance for workloads with a data size that exceeds * or is close to the system's memory). */ - remove_extent_mapping(inode, em); + btrfs_remove_extent_mapping(inode, em); /* Once for the inode's extent map tree. */ - free_extent_map(em); + btrfs_free_extent_map(em); next: - start = extent_map_end(em); + start = btrfs_extent_map_end(em); write_unlock(&extent_tree->lock); /* Once for us, for the lookup_extent_mapping() reference. */ - free_extent_map(em); + btrfs_free_extent_map(em); if (need_resched()) { /* @@ -2495,11 +2752,6 @@ next: return try_release_extent_state(io_tree, folio); } -static void __free_extent_buffer(struct extent_buffer *eb) -{ - kmem_cache_free(extent_buffer_cache, eb); -} - static int extent_buffer_under_io(const struct extent_buffer *eb) { return (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || @@ -2508,13 +2760,13 @@ static int extent_buffer_under_io(const struct extent_buffer *eb) static bool folio_range_has_eb(struct folio *folio) { - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; lockdep_assert_held(&folio->mapping->i_private_lock); if (folio_test_private(folio)) { - subpage = folio_get_private(folio); - if (atomic_read(&subpage->eb_refs)) + bfs = folio_get_private(folio); + if (atomic_read(&bfs->eb_refs)) return true; } return false; @@ -2523,6 +2775,7 @@ static bool folio_range_has_eb(struct folio *folio) static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio) { struct btrfs_fs_info *fs_info = eb->fs_info; + struct address_space *mapping = folio->mapping; const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); /* @@ -2530,21 +2783,20 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * be done under the i_private_lock. */ if (mapped) - spin_lock(&folio->mapping->i_private_lock); + spin_lock(&mapping->i_private_lock); if (!folio_test_private(folio)) { if (mapped) - spin_unlock(&folio->mapping->i_private_lock); + spin_unlock(&mapping->i_private_lock); return; } - if (fs_info->nodesize >= PAGE_SIZE) { + if (!btrfs_meta_is_subpage(fs_info)) { /* - * We do this since we'll remove the pages after we've - * removed the eb from the radix tree, so we could race - * and have this page now attached to the new eb. So - * only clear folio if it's still connected to - * this eb. + * We do this since we'll remove the pages after we've removed + * the eb from the xarray, so we could race and have this page + * now attached to the new eb. So only clear folio if it's + * still connected to this eb. */ if (folio_test_private(folio) && folio_get_private(folio) == eb) { BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); @@ -2554,7 +2806,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo folio_detach_private(folio); } if (mapped) - spin_unlock(&folio->mapping->i_private_lock); + spin_unlock(&mapping->i_private_lock); return; } @@ -2564,7 +2816,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * attached to one dummy eb, no sharing. */ if (!mapped) { - btrfs_detach_subpage(fs_info, folio); + btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA); return; } @@ -2575,13 +2827,13 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * page range and no unfinished IO. */ if (!folio_range_has_eb(folio)) - btrfs_detach_subpage(fs_info, folio); + btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA); - spin_unlock(&folio->mapping->i_private_lock); + spin_unlock(&mapping->i_private_lock); } -/* Release all pages attached to the extent buffer */ -static void btrfs_release_extent_buffer_pages(const struct extent_buffer *eb) +/* Release all folios attached to the extent buffer */ +static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb) { ASSERT(!extent_buffer_under_io(eb)); @@ -2592,9 +2844,6 @@ static void btrfs_release_extent_buffer_pages(const struct extent_buffer *eb) continue; detach_extent_buffer_folio(eb, folio); - - /* One for when we allocated the folio. */ - folio_put(folio); } } @@ -2603,40 +2852,57 @@ static void btrfs_release_extent_buffer_pages(const struct extent_buffer *eb) */ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) { - btrfs_release_extent_buffer_pages(eb); + btrfs_release_extent_buffer_folios(eb); btrfs_leak_debug_del_eb(eb); - __free_extent_buffer(eb); + kmem_cache_free(extent_buffer_cache, eb); } -static struct extent_buffer * -__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, - unsigned long len) +static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) { struct extent_buffer *eb = NULL; eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL); eb->start = start; - eb->len = len; + eb->len = fs_info->nodesize; eb->fs_info = fs_info; init_rwsem(&eb->lock); btrfs_leak_debug_add_eb(eb); spin_lock_init(&eb->refs_lock); - atomic_set(&eb->refs, 1); + refcount_set(&eb->refs, 1); - ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE); + ASSERT(eb->len <= BTRFS_MAX_METADATA_BLOCKSIZE); return eb; } +/* + * For use in eb allocation error cleanup paths, as btrfs_release_extent_buffer() + * does not call folio_put(), and we need to set the folios to NULL so that + * btrfs_release_extent_buffer() will not detach them a second time. + */ +static void cleanup_extent_buffer_folios(struct extent_buffer *eb) +{ + const int num_folios = num_extent_folios(eb); + + /* We canont use num_extent_folios() as loop bound as eb->folios changes. */ + for (int i = 0; i < num_folios; i++) { + ASSERT(eb->folios[i]); + detach_extent_buffer_folio(eb, eb->folios[i]); + folio_put(eb->folios[i]); + eb->folios[i] = NULL; + } +} + struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) { struct extent_buffer *new; - int num_folios = num_extent_folios(src); + int num_folios; int ret; - new = __alloc_extent_buffer(src->fs_info, src->start, src->len); + new = __alloc_extent_buffer(src->fs_info, src->start); if (new == NULL) return NULL; @@ -2648,78 +2914,78 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); ret = alloc_eb_folio_array(new, false); - if (ret) { - btrfs_release_extent_buffer(new); - return NULL; - } + if (ret) + goto release_eb; + ASSERT(num_extent_folios(src) == num_extent_folios(new), + "%d != %d", num_extent_folios(src), num_extent_folios(new)); + /* Explicitly use the cached num_extent value from now on. */ + num_folios = num_extent_folios(src); for (int i = 0; i < num_folios; i++) { struct folio *folio = new->folios[i]; ret = attach_extent_buffer_folio(new, folio, NULL); - if (ret < 0) { - btrfs_release_extent_buffer(new); - return NULL; - } + if (ret < 0) + goto cleanup_folios; WARN_ON(folio_test_dirty(folio)); } + for (int i = 0; i < num_folios; i++) + folio_put(new->folios[i]); + copy_extent_buffer_full(new, src); set_extent_buffer_uptodate(new); return new; + +cleanup_folios: + cleanup_extent_buffer_folios(new); +release_eb: + btrfs_release_extent_buffer(new); + return NULL; } -struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len) +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) { struct extent_buffer *eb; - int num_folios = 0; int ret; - eb = __alloc_extent_buffer(fs_info, start, len); + eb = __alloc_extent_buffer(fs_info, start); if (!eb) return NULL; ret = alloc_eb_folio_array(eb, false); if (ret) - goto err; + goto release_eb; - num_folios = num_extent_folios(eb); - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL); if (ret < 0) - goto err; + goto cleanup_folios; } + for (int i = 0; i < num_extent_folios(eb); i++) + folio_put(eb->folios[i]); set_extent_buffer_uptodate(eb); btrfs_set_header_nritems(eb, 0); set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); return eb; -err: - for (int i = 0; i < num_folios; i++) { - if (eb->folios[i]) { - detach_extent_buffer_folio(eb, eb->folios[i]); - folio_put(eb->folios[i]); - } - } - __free_extent_buffer(eb); - return NULL; -} -struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start) -{ - return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize); +cleanup_folios: + cleanup_extent_buffer_folios(eb); +release_eb: + btrfs_release_extent_buffer(eb); + return NULL; } static void check_buffer_tree_ref(struct extent_buffer *eb) { int refs; /* - * The TREE_REF bit is first set when the extent_buffer is added - * to the radix tree. It is also reset, if unset, when a new reference - * is created by find_extent_buffer. + * The TREE_REF bit is first set when the extent_buffer is added to the + * xarray. It is also reset, if unset, when a new reference is created + * by find_extent_buffer. * * It is only cleared in two cases: freeing the last non-tree * reference to the extent_buffer when its STALE bit is set or @@ -2731,31 +2997,28 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) * conditions between the calls to check_buffer_tree_ref in those * codepaths and clearing TREE_REF in try_release_extent_buffer. * - * The actual lifetime of the extent_buffer in the radix tree is - * adequately protected by the refcount, but the TREE_REF bit and - * its corresponding reference are not. To protect against this - * class of races, we call check_buffer_tree_ref from the codepaths - * which trigger io. Note that once io is initiated, TREE_REF can no - * longer be cleared, so that is the moment at which any such race is - * best fixed. + * The actual lifetime of the extent_buffer in the xarray is adequately + * protected by the refcount, but the TREE_REF bit and its corresponding + * reference are not. To protect against this class of races, we call + * check_buffer_tree_ref() from the code paths which trigger io. Note that + * once io is initiated, TREE_REF can no longer be cleared, so that is + * the moment at which any such race is best fixed. */ - refs = atomic_read(&eb->refs); + refs = refcount_read(&eb->refs); if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) return; spin_lock(&eb->refs_lock); if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) - atomic_inc(&eb->refs); + refcount_inc(&eb->refs); spin_unlock(&eb->refs_lock); } static void mark_extent_buffer_accessed(struct extent_buffer *eb) { - int num_folios= num_extent_folios(eb); - check_buffer_tree_ref(eb); - for (int i = 0; i < num_folios; i++) + for (int i = 0; i < num_extent_folios(eb); i++) folio_mark_accessed(eb->folios[i]); } @@ -2788,10 +3051,10 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, return eb; } -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, u64 start) { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *eb, *exists = NULL; int ret; @@ -2803,47 +3066,48 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, return ERR_PTR(-ENOMEM); eb->fs_info = fs_info; again: - ret = radix_tree_preload(GFP_NOFS); - if (ret) { - exists = ERR_PTR(ret); - goto free_eb; + xa_lock_irq(&fs_info->buffer_tree); + exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->nodesize_bits, + NULL, eb, GFP_NOFS); + if (xa_is_err(exists)) { + ret = xa_err(exists); + xa_unlock_irq(&fs_info->buffer_tree); + btrfs_release_extent_buffer(eb); + return ERR_PTR(ret); } - spin_lock(&fs_info->buffer_lock); - ret = radix_tree_insert(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits, eb); - spin_unlock(&fs_info->buffer_lock); - radix_tree_preload_end(); - if (ret == -EEXIST) { - exists = find_extent_buffer(fs_info, start); - if (exists) - goto free_eb; - else + if (exists) { + if (!refcount_inc_not_zero(&exists->refs)) { + /* The extent buffer is being freed, retry. */ + xa_unlock_irq(&fs_info->buffer_tree); goto again; + } + xa_unlock_irq(&fs_info->buffer_tree); + btrfs_release_extent_buffer(eb); + return exists; } + xa_unlock_irq(&fs_info->buffer_tree); check_buffer_tree_ref(eb); - set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); return eb; -free_eb: - btrfs_release_extent_buffer(eb); - return exists; -} +#else + /* Stub to avoid linker error when compiled with optimizations turned off. */ + return NULL; #endif +} -static struct extent_buffer *grab_extent_buffer( - struct btrfs_fs_info *fs_info, struct page *page) +static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, + struct folio *folio) { - struct folio *folio = page_folio(page); struct extent_buffer *exists; - lockdep_assert_held(&page->mapping->i_private_lock); + lockdep_assert_held(&folio->mapping->i_private_lock); /* - * For subpage case, we completely rely on radix tree to ensure we - * don't try to insert two ebs for the same bytenr. So here we always - * return NULL and just continue. + * For subpage case, we completely rely on xarray to ensure we don't try + * to insert two ebs for the same bytenr. So here we always return NULL + * and just continue. */ - if (fs_info->nodesize < PAGE_SIZE) + if (btrfs_meta_is_subpage(fs_info)) return NULL; /* Page not yet attached to an extent buffer */ @@ -2851,40 +3115,42 @@ static struct extent_buffer *grab_extent_buffer( return NULL; /* - * We could have already allocated an eb for this page and attached one + * We could have already allocated an eb for this folio and attached one * so lets see if we can get a ref on the existing eb, and if we can we * know it's good and we can just return that one, else we know we can * just overwrite folio private. */ exists = folio_get_private(folio); - if (atomic_inc_not_zero(&exists->refs)) + if (refcount_inc_not_zero(&exists->refs)) return exists; - WARN_ON(PageDirty(page)); + WARN_ON(folio_test_dirty(folio)); folio_detach_private(folio); return NULL; } -static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) +/* + * Validate alignment constraints of eb at logical address @start. + */ +static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) { if (!IS_ALIGNED(start, fs_info->sectorsize)) { btrfs_err(fs_info, "bad tree block start %llu", start); - return -EINVAL; + return true; } - if (fs_info->nodesize < PAGE_SIZE && - offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) { + if (fs_info->nodesize < PAGE_SIZE && !IS_ALIGNED(start, fs_info->nodesize)) { btrfs_err(fs_info, - "tree block crosses page boundary, start %llu nodesize %u", + "tree block is not nodesize aligned, start %llu nodesize %u", start, fs_info->nodesize); - return -EINVAL; + return true; } if (fs_info->nodesize >= PAGE_SIZE && !PAGE_ALIGNED(start)) { btrfs_err(fs_info, "tree block is not page aligned, start %llu nodesize %u", start, fs_info->nodesize); - return -EINVAL; + return true; } if (!IS_ALIGNED(start, fs_info->nodesize) && !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) { @@ -2892,10 +3158,9 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) "tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance", start, fs_info->nodesize); } - return 0; + return false; } - /* * Return 0 if eb->folios[i] is attached to btree inode successfully. * Return >0 if there is already another extent buffer for the range, @@ -2905,14 +3170,14 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) * The caller needs to free the existing folios and retry using the same order. */ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, - struct btrfs_subpage *prealloc, + struct btrfs_folio_state *prealloc, struct extent_buffer **found_eb_ret) { struct btrfs_fs_info *fs_info = eb->fs_info; struct address_space *mapping = fs_info->btree_inode->i_mapping; - const unsigned long index = eb->start >> PAGE_SHIFT; - struct folio *existing_folio = NULL; + const pgoff_t index = eb->start >> PAGE_SHIFT; + struct folio *existing_folio; int ret; ASSERT(found_eb_ret); @@ -2921,6 +3186,7 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, ASSERT(eb->folios[i]); retry: + existing_folio = NULL; ret = filemap_add_folio(mapping, eb->folios[i], index + i, GFP_NOFS | __GFP_NOFAIL); if (!ret) @@ -2928,10 +3194,8 @@ retry: existing_folio = filemap_lock_folio(mapping, index + i); /* The page cache only exists for a very short time, just retry. */ - if (IS_ERR(existing_folio)) { - existing_folio = NULL; + if (IS_ERR(existing_folio)) goto retry; - } /* For now, we should only have single-page folios for btree inode. */ ASSERT(folio_nr_pages(existing_folio) == 1); @@ -2944,15 +3208,14 @@ retry: finish: spin_lock(&mapping->i_private_lock); - if (existing_folio && fs_info->nodesize < PAGE_SIZE) { + if (existing_folio && btrfs_meta_is_subpage(fs_info)) { /* We're going to reuse the existing page, can drop our folio now. */ __free_page(folio_page(eb->folios[i], 0)); eb->folios[i] = existing_folio; } else if (existing_folio) { struct extent_buffer *existing_eb; - existing_eb = grab_extent_buffer(fs_info, - folio_page(existing_folio, 0)); + existing_eb = grab_extent_buffer(fs_info, existing_folio); if (existing_eb) { /* The extent buffer still exists, we can use it directly. */ *found_eb_ret = existing_eb; @@ -2973,7 +3236,7 @@ finish: /* * To inform we have an extra eb under allocation, so that * detach_extent_buffer_page() won't release the folio private when the - * eb hasn't been inserted into radix tree yet. + * eb hasn't been inserted into the xarray yet. * * The ref will be decreased when the eb releases the page, in * detach_extent_buffer_page(). Thus needs no special handling in the @@ -2987,12 +3250,10 @@ finish: struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level) { - unsigned long len = fs_info->nodesize; - int num_folios; int attached = 0; struct extent_buffer *eb; struct extent_buffer *existing_eb = NULL; - struct btrfs_subpage *prealloc = NULL; + struct btrfs_folio_state *prealloc = NULL; u64 lockdep_owner = owner_root; bool page_contig = true; int uptodate = 1; @@ -3016,7 +3277,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, if (eb) return eb; - eb = __alloc_extent_buffer(fs_info, start, len); + eb = __alloc_extent_buffer(fs_info, start); if (!eb) return ERR_PTR(-ENOMEM); @@ -3036,8 +3297,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * The memory will be freed by attach_extent_buffer_page() or freed * manually if we exit earlier. */ - if (fs_info->nodesize < PAGE_SIZE) { - prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); + if (btrfs_meta_is_subpage(fs_info)) { + prealloc = btrfs_alloc_folio_state(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA); if (IS_ERR(prealloc)) { ret = PTR_ERR(prealloc); goto out; @@ -3048,13 +3309,12 @@ reallocate: /* Allocate all pages first. */ ret = alloc_eb_folio_array(eb, true); if (ret < 0) { - btrfs_free_subpage(prealloc); + btrfs_free_folio_state(prealloc); goto out; } - num_folios = num_extent_folios(eb); /* Attach all pages to the filemap. */ - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio; ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb); @@ -3083,7 +3343,7 @@ reallocate: * using 0-order folios. */ if (unlikely(ret == -EAGAIN)) { - ASSERT(0); + DEBUG_WARN("folio order mismatch between new eb and filemap"); goto reallocate; } attached++; @@ -3094,7 +3354,7 @@ reallocate: * and free the allocated page. */ folio = eb->folios[i]; - WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len)); + WARN_ON(btrfs_meta_folio_test_dirty(folio, eb)); /* * Check if the current page is physically contiguous with previous eb @@ -3105,15 +3365,14 @@ reallocate: if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0)) page_contig = false; - if (!btrfs_folio_test_uptodate(fs_info, folio, eb->start, eb->len)) + if (!btrfs_meta_folio_test_uptodate(folio, eb)) uptodate = 0; /* * We can't unlock the pages just yet since the extent buffer - * hasn't been properly inserted in the radix tree, this - * opens a race with btree_release_folio which can free a page - * while we are still filling in all pages for the buffer and - * we could crash. + * hasn't been properly inserted into the xarray, this opens a + * race with btree_release_folio() which can free a page while we + * are still filling in all pages for the buffer and we could crash. */ } if (uptodate) @@ -3122,38 +3381,46 @@ reallocate: if (page_contig) eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start); again: - ret = radix_tree_preload(GFP_NOFS); - if (ret) + xa_lock_irq(&fs_info->buffer_tree); + existing_eb = __xa_cmpxchg(&fs_info->buffer_tree, + start >> fs_info->nodesize_bits, NULL, eb, + GFP_NOFS); + if (xa_is_err(existing_eb)) { + ret = xa_err(existing_eb); + xa_unlock_irq(&fs_info->buffer_tree); goto out; - - spin_lock(&fs_info->buffer_lock); - ret = radix_tree_insert(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits, eb); - spin_unlock(&fs_info->buffer_lock); - radix_tree_preload_end(); - if (ret == -EEXIST) { - ret = 0; - existing_eb = find_extent_buffer(fs_info, start); - if (existing_eb) - goto out; - else + } + if (existing_eb) { + if (!refcount_inc_not_zero(&existing_eb->refs)) { + xa_unlock_irq(&fs_info->buffer_tree); goto again; + } + xa_unlock_irq(&fs_info->buffer_tree); + goto out; } + xa_unlock_irq(&fs_info->buffer_tree); + /* add one reference for the tree */ check_buffer_tree_ref(eb); - set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); /* * Now it's safe to unlock the pages because any calls to * btree_release_folio will correctly detect that a page belongs to a * live buffer and won't free them prematurely. */ - for (int i = 0; i < num_folios; i++) - unlock_page(folio_page(eb->folios[i], 0)); + for (int i = 0; i < num_extent_folios(eb); i++) { + folio_unlock(eb->folios[i]); + /* + * A folio that has been added to an address_space mapping + * should not continue holding the refcount from its original + * allocation indefinitely. + */ + folio_put(eb->folios[i]); + } return eb; out: - WARN_ON(!atomic_dec_and_test(&eb->refs)); + WARN_ON(!refcount_dec_and_test(&eb->refs)); /* * Any attached folios need to be detached before we unlock them. This @@ -3163,26 +3430,22 @@ out: * want that to grab this eb, as we're getting ready to free it. So we * have to detach it first and then unlock it. * - * We have to drop our reference and NULL it out here because in the - * subpage case detaching does a btrfs_folio_dec_eb_refs() for our eb. - * Below when we call btrfs_release_extent_buffer() we will call - * detach_extent_buffer_folio() on our remaining pages in the !subpage - * case. If we left eb->folios[i] populated in the subpage case we'd - * double put our reference and be super sad. + * Note: the bounds is num_extent_pages() as we need to go through all slots. */ - for (int i = 0; i < attached; i++) { - ASSERT(eb->folios[i]); - detach_extent_buffer_folio(eb, eb->folios[i]); - unlock_page(folio_page(eb->folios[i], 0)); - folio_put(eb->folios[i]); + for (int i = 0; i < num_extent_pages(eb); i++) { + struct folio *folio = eb->folios[i]; + + if (i < attached) { + ASSERT(folio); + detach_extent_buffer_folio(eb, folio); + folio_unlock(folio); + } else if (!folio) { + continue; + } + + folio_put(folio); eb->folios[i] = NULL; } - /* - * Now all pages of that extent buffer is unmapped, set UNMAPPED flag, - * so it can be cleaned up without utilizing page->mapping. - */ - set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); - btrfs_release_extent_buffer(eb); if (ret < 0) return ERR_PTR(ret); @@ -3195,7 +3458,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) struct extent_buffer *eb = container_of(head, struct extent_buffer, rcu_head); - __free_extent_buffer(eb); + kmem_cache_free(extent_buffer_cache, eb); } static int release_extent_buffer(struct extent_buffer *eb) @@ -3203,27 +3466,35 @@ static int release_extent_buffer(struct extent_buffer *eb) { lockdep_assert_held(&eb->refs_lock); - WARN_ON(atomic_read(&eb->refs) == 0); - if (atomic_dec_and_test(&eb->refs)) { - if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) { - struct btrfs_fs_info *fs_info = eb->fs_info; + if (refcount_dec_and_test(&eb->refs)) { + struct btrfs_fs_info *fs_info = eb->fs_info; - spin_unlock(&eb->refs_lock); + spin_unlock(&eb->refs_lock); - spin_lock(&fs_info->buffer_lock); - radix_tree_delete(&fs_info->buffer_radix, - eb->start >> fs_info->sectorsize_bits); - spin_unlock(&fs_info->buffer_lock); - } else { - spin_unlock(&eb->refs_lock); - } + /* + * We're erasing, theoretically there will be no allocations, so + * just use GFP_ATOMIC. + * + * We use cmpxchg instead of erase because we do not know if + * this eb is actually in the tree or not, we could be cleaning + * up an eb that we allocated but never inserted into the tree. + * Thus use cmpxchg to remove it from the tree if it is there, + * or leave the other entry if this isn't in the tree. + * + * The documentation says that putting a NULL value is the same + * as erase as long as XA_FLAGS_ALLOC is not set, which it isn't + * in this case. + */ + xa_cmpxchg_irq(&fs_info->buffer_tree, + eb->start >> fs_info->nodesize_bits, eb, NULL, + GFP_ATOMIC); btrfs_leak_debug_del_eb(eb); - /* Should be safe to release our pages at this point */ - btrfs_release_extent_buffer_pages(eb); + /* Should be safe to release folios at this point. */ + btrfs_release_extent_buffer_folios(eb); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) { - __free_extent_buffer(eb); + kmem_cache_free(extent_buffer_cache, eb); return 1; } #endif @@ -3241,22 +3512,26 @@ void free_extent_buffer(struct extent_buffer *eb) if (!eb) return; - refs = atomic_read(&eb->refs); + refs = refcount_read(&eb->refs); while (1) { - if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3) - || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && - refs == 1)) + if (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)) { + if (refs == 1) + break; + } else if (refs <= 3) { break; - if (atomic_try_cmpxchg(&eb->refs, &refs, refs - 1)) + } + + /* Optimization to avoid locking eb->refs_lock. */ + if (atomic_try_cmpxchg(&eb->refs.refs, &refs, refs - 1)) return; } spin_lock(&eb->refs_lock); - if (atomic_read(&eb->refs) == 2 && + if (refcount_read(&eb->refs) == 2 && test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && !extent_buffer_under_io(eb) && test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) - atomic_dec(&eb->refs); + refcount_dec(&eb->refs); /* * I know this is terrible, but it's temporary until we stop tracking @@ -3273,44 +3548,27 @@ void free_extent_buffer_stale(struct extent_buffer *eb) spin_lock(&eb->refs_lock); set_bit(EXTENT_BUFFER_STALE, &eb->bflags); - if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && + if (refcount_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) - atomic_dec(&eb->refs); + refcount_dec(&eb->refs); release_extent_buffer(eb); } -static void btree_clear_folio_dirty(struct folio *folio) +static void btree_clear_folio_dirty_tag(struct folio *folio) { - ASSERT(folio_test_dirty(folio)); + ASSERT(!folio_test_dirty(folio)); ASSERT(folio_test_locked(folio)); - folio_clear_dirty_for_io(folio); xa_lock_irq(&folio->mapping->i_pages); if (!folio_test_dirty(folio)) - __xa_clear_mark(&folio->mapping->i_pages, - folio_index(folio), PAGECACHE_TAG_DIRTY); + __xa_clear_mark(&folio->mapping->i_pages, folio->index, + PAGECACHE_TAG_DIRTY); xa_unlock_irq(&folio->mapping->i_pages); } -static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) -{ - struct btrfs_fs_info *fs_info = eb->fs_info; - struct folio *folio = eb->folios[0]; - bool last; - - /* btree_clear_folio_dirty() needs page locked. */ - folio_lock(folio); - last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, eb->len); - if (last) - btree_clear_folio_dirty(folio); - folio_unlock(folio); - WARN_ON(atomic_read(&eb->refs) == 0); -} - void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios; btrfs_assert_tree_write_locked(eb); @@ -3334,129 +3592,99 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) return; + buffer_tree_clear_mark(eb, PAGECACHE_TAG_DIRTY); percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, fs_info->dirty_metadata_batch); - if (eb->fs_info->nodesize < PAGE_SIZE) - return clear_subpage_extent_buffer_dirty(eb); - - num_folios = num_extent_folios(eb); - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; + bool last; if (!folio_test_dirty(folio)) continue; folio_lock(folio); - btree_clear_folio_dirty(folio); + last = btrfs_meta_folio_clear_and_test_dirty(folio, eb); + if (last) + btree_clear_folio_dirty_tag(folio); folio_unlock(folio); } - WARN_ON(atomic_read(&eb->refs) == 0); + WARN_ON(refcount_read(&eb->refs) == 0); } void set_extent_buffer_dirty(struct extent_buffer *eb) { - int num_folios; bool was_dirty; check_buffer_tree_ref(eb); was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - num_folios = num_extent_folios(eb); - WARN_ON(atomic_read(&eb->refs) == 0); + WARN_ON(refcount_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)); if (!was_dirty) { - bool subpage = eb->fs_info->nodesize < PAGE_SIZE; + bool subpage = btrfs_meta_is_subpage(eb->fs_info); /* * For subpage case, we can have other extent buffers in the - * same page, and in clear_subpage_extent_buffer_dirty() we + * same page, and in clear_extent_buffer_dirty() we * have to clear page dirty without subpage lock held. * This can cause race where our page gets dirty cleared after * we just set it. * - * Thankfully, clear_subpage_extent_buffer_dirty() has locked + * Thankfully, clear_extent_buffer_dirty() has locked * its page for other reasons, we can use page lock to prevent * the above race. */ if (subpage) - lock_page(folio_page(eb->folios[0], 0)); - for (int i = 0; i < num_folios; i++) - btrfs_folio_set_dirty(eb->fs_info, eb->folios[i], - eb->start, eb->len); + folio_lock(eb->folios[0]); + for (int i = 0; i < num_extent_folios(eb); i++) + btrfs_meta_folio_set_dirty(eb->folios[i], eb); + buffer_tree_set_mark(eb, PAGECACHE_TAG_DIRTY); if (subpage) - unlock_page(folio_page(eb->folios[0], 0)); + folio_unlock(eb->folios[0]); percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, eb->len, eb->fs_info->dirty_metadata_batch); } #ifdef CONFIG_BTRFS_DEBUG - for (int i = 0; i < num_folios; i++) + for (int i = 0; i < num_extent_folios(eb); i++) ASSERT(folio_test_dirty(eb->folios[i])); #endif } void clear_extent_buffer_uptodate(struct extent_buffer *eb) { - struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios = num_extent_folios(eb); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; if (!folio) continue; - /* - * This is special handling for metadata subpage, as regular - * btrfs_is_subpage() can not handle cloned/dummy metadata. - */ - if (fs_info->nodesize >= PAGE_SIZE) - folio_clear_uptodate(folio); - else - btrfs_subpage_clear_uptodate(fs_info, folio, - eb->start, eb->len); + btrfs_meta_folio_clear_uptodate(folio, eb); } } void set_extent_buffer_uptodate(struct extent_buffer *eb) { - struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios = num_extent_folios(eb); set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - for (int i = 0; i < num_folios; i++) { - struct folio *folio = eb->folios[i]; - - /* - * This is special handling for metadata subpage, as regular - * btrfs_is_subpage() can not handle cloned/dummy metadata. - */ - if (fs_info->nodesize >= PAGE_SIZE) - folio_mark_uptodate(folio); - else - btrfs_subpage_set_uptodate(fs_info, folio, - eb->start, eb->len); - } + for (int i = 0; i < num_extent_folios(eb); i++) + btrfs_meta_folio_set_uptodate(eb->folios[i], eb); } static void clear_extent_buffer_reading(struct extent_buffer *eb) { - clear_bit(EXTENT_BUFFER_READING, &eb->bflags); - smp_mb__after_atomic(); - wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING); + clear_and_wake_up_bit(EXTENT_BUFFER_READING, &eb->bflags); } static void end_bbio_meta_read(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; - struct btrfs_fs_info *fs_info = eb->fs_info; bool uptodate = !bbio->bio.bi_status; - struct folio_iter fi; - u32 bio_offset = 0; /* * If the extent buffer is marked UPTODATE before the read operation @@ -3471,25 +3699,10 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio) btrfs_validate_extent_buffer(eb, &bbio->parent_check) < 0) uptodate = false; - if (uptodate) { + if (uptodate) set_extent_buffer_uptodate(eb); - } else { + else clear_extent_buffer_uptodate(eb); - set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); - } - - bio_for_each_folio_all(fi, &bbio->bio) { - struct folio *folio = fi.folio; - u64 start = eb->start + bio_offset; - u32 len = fi.length; - - if (uptodate) - btrfs_folio_set_uptodate(fs_info, folio, start, len); - else - btrfs_folio_clear_uptodate(fs_info, folio, start, len); - - bio_offset += len; - } clear_extent_buffer_reading(eb); free_extent_buffer(eb); @@ -3497,11 +3710,10 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio) bio_put(&bbio->bio); } -int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, - const struct btrfs_tree_parent_check *check) +int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, + const struct btrfs_tree_parent_check *check) { struct btrfs_bio *bbio; - bool ret; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -3516,7 +3728,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, /* Someone else is already reading the buffer, just wait for it. */ if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) - goto done; + return 0; /* * Between the initial test_bit(EXTENT_BUFFER_UPTODATE) and the above @@ -3529,10 +3741,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, return 0; } - clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = 0; check_buffer_tree_ref(eb); - atomic_inc(&eb->refs); + refcount_inc(&eb->refs); bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, REQ_OP_READ | REQ_META, eb->fs_info, @@ -3541,29 +3752,31 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; memcpy(&bbio->parent_check, check, sizeof(*check)); - if (eb->fs_info->nodesize < PAGE_SIZE) { - ret = bio_add_folio(&bbio->bio, eb->folios[0], eb->len, - eb->start - folio_pos(eb->folios[0])); - ASSERT(ret); - } else { - int num_folios = num_extent_folios(eb); - - for (int i = 0; i < num_folios; i++) { - struct folio *folio = eb->folios[i]; + for (int i = 0; i < num_extent_folios(eb); i++) { + struct folio *folio = eb->folios[i]; + u64 range_start = max_t(u64, eb->start, folio_pos(folio)); + u32 range_len = min_t(u64, folio_end(folio), + eb->start + eb->len) - range_start; - ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); - ASSERT(ret); - } + bio_add_folio_nofail(&bbio->bio, folio, range_len, + offset_in_folio(folio, range_start)); } btrfs_submit_bbio(bbio, mirror_num); + return 0; +} -done: - if (wait == WAIT_COMPLETE) { - wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); - if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) - return -EIO; - } +int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, + const struct btrfs_tree_parent_check *check) +{ + int ret; + ret = read_extent_buffer_pages_nowait(eb, mirror_num, check); + if (ret < 0) + return ret; + + wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); + if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) + return -EIO; return 0; } @@ -3573,7 +3786,7 @@ static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, btrfs_warn(eb->fs_info, "access to eb bytenr %llu len %u out of range start %lu len %lu", eb->start, eb->len, start, len); - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN(); return true; } @@ -3735,7 +3948,7 @@ static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i) if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) return; - if (fs_info->nodesize < PAGE_SIZE) { + if (btrfs_meta_is_subpage(fs_info)) { folio = eb->folios[0]; ASSERT(i == 0); if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio, @@ -3921,8 +4134,8 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb, * @start: offset of the bitmap item in the extent buffer * @nr: bit number to test */ -int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, - unsigned long nr) +bool extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, + unsigned long nr) { unsigned long i; size_t offset; @@ -4109,82 +4322,26 @@ void memmove_extent_buffer(const struct extent_buffer *dst, } } -#define GANG_LOOKUP_SIZE 16 -static struct extent_buffer *get_next_extent_buffer( - const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr) -{ - struct extent_buffer *gang[GANG_LOOKUP_SIZE]; - struct extent_buffer *found = NULL; - u64 folio_start = folio_pos(folio); - u64 cur = folio_start; - - ASSERT(in_range(bytenr, folio_start, PAGE_SIZE)); - lockdep_assert_held(&fs_info->buffer_lock); - - while (cur < folio_start + PAGE_SIZE) { - int ret; - int i; - - ret = radix_tree_gang_lookup(&fs_info->buffer_radix, - (void **)gang, cur >> fs_info->sectorsize_bits, - min_t(unsigned int, GANG_LOOKUP_SIZE, - PAGE_SIZE / fs_info->nodesize)); - if (ret == 0) - goto out; - for (i = 0; i < ret; i++) { - /* Already beyond page end */ - if (gang[i]->start >= folio_start + PAGE_SIZE) - goto out; - /* Found one */ - if (gang[i]->start >= bytenr) { - found = gang[i]; - goto out; - } - } - cur = gang[ret - 1]->start + gang[ret - 1]->len; - } -out: - return found; -} - static int try_release_subpage_extent_buffer(struct folio *folio) { struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); - u64 cur = folio_pos(folio); - const u64 end = cur + PAGE_SIZE; + struct extent_buffer *eb; + unsigned long start = (folio_pos(folio) >> fs_info->nodesize_bits); + unsigned long index = start; + unsigned long end = index + (PAGE_SIZE >> fs_info->nodesize_bits) - 1; int ret; - while (cur < end) { - struct extent_buffer *eb = NULL; - - /* - * Unlike try_release_extent_buffer() which uses folio private - * to grab buffer, for subpage case we rely on radix tree, thus - * we need to ensure radix tree consistency. - * - * We also want an atomic snapshot of the radix tree, thus go - * with spinlock rather than RCU. - */ - spin_lock(&fs_info->buffer_lock); - eb = get_next_extent_buffer(fs_info, folio, cur); - if (!eb) { - /* No more eb in the page range after or at cur */ - spin_unlock(&fs_info->buffer_lock); - break; - } - cur = eb->start + eb->len; - + xa_lock_irq(&fs_info->buffer_tree); + xa_for_each_range(&fs_info->buffer_tree, index, eb, start, end) { /* * The same as try_release_extent_buffer(), to ensure the eb * won't disappear out from under us. */ spin_lock(&eb->refs_lock); - if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { + if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { spin_unlock(&eb->refs_lock); - spin_unlock(&fs_info->buffer_lock); - break; + continue; } - spin_unlock(&fs_info->buffer_lock); /* * If tree ref isn't set then we know the ref on this eb is a @@ -4201,8 +4358,12 @@ static int try_release_subpage_extent_buffer(struct folio *folio) * check the folio private at the end. And * release_extent_buffer() will release the refs_lock. */ + xa_unlock_irq(&fs_info->buffer_tree); release_extent_buffer(eb); + xa_lock_irq(&fs_info->buffer_tree); } + xa_unlock_irq(&fs_info->buffer_tree); + /* * Finally to check if we have cleared folio private, as if we have * released all ebs in the page, the folio private should be cleared now. @@ -4221,7 +4382,7 @@ int try_release_extent_buffer(struct folio *folio) { struct extent_buffer *eb; - if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE) + if (btrfs_meta_is_subpage(folio_to_fs_info(folio))) return try_release_subpage_extent_buffer(folio); /* @@ -4243,7 +4404,7 @@ int try_release_extent_buffer(struct folio *folio) * this page. */ spin_lock(&eb->refs_lock); - if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { + if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { spin_unlock(&eb->refs_lock); spin_unlock(&folio->mapping->i_private_lock); return 0; @@ -4294,7 +4455,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, return; } - ret = read_extent_buffer_pages(eb, WAIT_NONE, 0, &check); + ret = read_extent_buffer_pages_nowait(eb, 0, &check); if (ret < 0) free_extent_buffer_stale(eb); else diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 8a36117ed453..61130786b9a3 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -38,16 +38,10 @@ struct btrfs_tree_parent_check; enum { EXTENT_BUFFER_UPTODATE, EXTENT_BUFFER_DIRTY, - EXTENT_BUFFER_CORRUPT, - /* this got triggered by readahead */ - EXTENT_BUFFER_READAHEAD, EXTENT_BUFFER_TREE_REF, EXTENT_BUFFER_STALE, EXTENT_BUFFER_WRITEBACK, - /* read IO error */ - EXTENT_BUFFER_READ_ERR, EXTENT_BUFFER_UNMAPPED, - EXTENT_BUFFER_IN_TREE, /* write IO error */ EXTENT_BUFFER_WRITE_ERR, /* Indicate the extent buffer is written zeroed out (for zoned) */ @@ -79,7 +73,7 @@ enum { * single word in a bitmap may straddle two pages in the extent buffer. */ #define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE) -#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1) +#define BYTE_MASK ((1U << BITS_PER_BYTE) - 1) #define BITMAP_FIRST_BYTE_MASK(start) \ ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK) #define BITMAP_LAST_BYTE_MASK(nbits) \ @@ -104,7 +98,7 @@ struct extent_buffer { void *addr; spinlock_t refs_lock; - atomic_t refs; + refcount_t refs; int read_mirror; /* >= 0 if eb belongs to a log tree, -1 otherwise */ s8 log_index; @@ -246,15 +240,13 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); +void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); void btrfs_readahead(struct readahead_control *rac); int set_folio_extent_mapped(struct folio *folio); -int set_page_extent_mapped(struct page *page); void clear_folio_extent_mapped(struct folio *folio); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level); -struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len); struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src); @@ -262,17 +254,23 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); void free_extent_buffer(struct extent_buffer *eb); void free_extent_buffer_stale(struct extent_buffer *eb); -#define WAIT_NONE 0 -#define WAIT_COMPLETE 1 -#define WAIT_PAGE_LOCK 2 -int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, +int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, const struct btrfs_tree_parent_check *parent_check); -void wait_on_extent_buffer_writeback(struct extent_buffer *eb); +int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, + const struct btrfs_tree_parent_check *parent_check); + +static inline void wait_on_extent_buffer_writeback(struct extent_buffer *eb) +{ + wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, + TASK_UNINTERRUPTIBLE); +} + void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, u64 gen, int level); void btrfs_readahead_node_child(struct extent_buffer *node, int slot); -static inline int num_extent_pages(const struct extent_buffer *eb) +/* Note: this can be used in for loops without caching the value in a variable. */ +static inline int __pure num_extent_pages(const struct extent_buffer *eb) { /* * For sectorsize == PAGE_SIZE case, since nodesize is always aligned to @@ -290,9 +288,13 @@ static inline int num_extent_pages(const struct extent_buffer *eb) * As we can have either one large folio covering the whole eb * (either nodesize <= PAGE_SIZE, or high order folio), or multiple * single-paged folios. + * + * Note: this can be used in for loops without caching the value in a variable. */ -static inline int num_extent_folios(const struct extent_buffer *eb) +static inline int __pure num_extent_folios(const struct extent_buffer *eb) { + if (!eb->folios[0]) + return 0; if (folio_order(eb->folios[0])) return 1; return num_extent_pages(eb); @@ -343,8 +345,8 @@ void memmove_extent_buffer(const struct extent_buffer *dst, unsigned long len); void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, unsigned long len); -int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, - unsigned long pos); +bool extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, + unsigned long pos); void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len); void extent_buffer_bitmap_clear(const struct extent_buffer *eb, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 67ce85ff0ae2..57f52585a6dd 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -13,7 +13,7 @@ static struct kmem_cache *extent_map_cache; -int __init extent_map_init(void) +int __init btrfs_extent_map_init(void) { extent_map_cache = kmem_cache_create("btrfs_extent_map", sizeof(struct extent_map), 0, 0, NULL); @@ -22,7 +22,7 @@ int __init extent_map_init(void) return 0; } -void __cold extent_map_exit(void) +void __cold btrfs_extent_map_exit(void) { kmem_cache_destroy(extent_map_cache); } @@ -31,7 +31,7 @@ void __cold extent_map_exit(void) * Initialize the extent tree @tree. Should be called for each new inode or * other user of the extent_map interface. */ -void extent_map_tree_init(struct extent_map_tree *tree) +void btrfs_extent_map_tree_init(struct extent_map_tree *tree) { tree->root = RB_ROOT; INIT_LIST_HEAD(&tree->modified_extents); @@ -42,7 +42,7 @@ void extent_map_tree_init(struct extent_map_tree *tree) * Allocate a new extent_map structure. The new structure is returned with a * reference count of one and needs to be freed using free_extent_map() */ -struct extent_map *alloc_extent_map(void) +struct extent_map *btrfs_alloc_extent_map(void) { struct extent_map *em; em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS); @@ -58,12 +58,12 @@ struct extent_map *alloc_extent_map(void) * Drop the reference out on @em by one and free the structure if the reference * count hits zero. */ -void free_extent_map(struct extent_map *em) +void btrfs_free_extent_map(struct extent_map *em) { if (!em) return; if (refcount_dec_and_test(&em->refs)) { - WARN_ON(extent_map_in_tree(em)); + WARN_ON(btrfs_extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); kmem_cache_free(extent_map_cache, em); } @@ -84,7 +84,7 @@ static void remove_em(struct btrfs_inode *inode, struct extent_map *em) rb_erase(&em->rb_node, &inode->extent_tree.root); RB_CLEAR_NODE(&em->rb_node); - if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root))) + if (!btrfs_is_testing(fs_info) && btrfs_is_fstree(btrfs_root_id(inode->root))) percpu_counter_dec(&fs_info->evictable_extent_maps); } @@ -102,19 +102,19 @@ static int tree_insert(struct rb_root *root, struct extent_map *em) if (em->start < entry->start) p = &(*p)->rb_left; - else if (em->start >= extent_map_end(entry)) + else if (em->start >= btrfs_extent_map_end(entry)) p = &(*p)->rb_right; else return -EEXIST; } orig_parent = parent; - while (parent && em->start >= extent_map_end(entry)) { + while (parent && em->start >= btrfs_extent_map_end(entry)) { parent = rb_next(parent); entry = rb_entry(parent, struct extent_map, rb_node); } if (parent) - if (end > entry->start && em->start < extent_map_end(entry)) + if (end > entry->start && em->start < btrfs_extent_map_end(entry)) return -EEXIST; parent = orig_parent; @@ -124,7 +124,7 @@ static int tree_insert(struct rb_root *root, struct extent_map *em) entry = rb_entry(parent, struct extent_map, rb_node); } if (parent) - if (end > entry->start && em->start < extent_map_end(entry)) + if (end > entry->start && em->start < btrfs_extent_map_end(entry)) return -EEXIST; rb_link_node(&em->rb_node, orig_parent, p); @@ -136,8 +136,8 @@ static int tree_insert(struct rb_root *root, struct extent_map *em) * Search through the tree for an extent_map with a given offset. If it can't * be found, try to find some neighboring extents */ -static struct rb_node *__tree_search(struct rb_root *root, u64 offset, - struct rb_node **prev_or_next_ret) +static struct rb_node *tree_search(struct rb_root *root, u64 offset, + struct rb_node **prev_or_next_ret) { struct rb_node *n = root->rb_node; struct rb_node *prev = NULL; @@ -154,14 +154,14 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, if (offset < entry->start) n = n->rb_left; - else if (offset >= extent_map_end(entry)) + else if (offset >= btrfs_extent_map_end(entry)) n = n->rb_right; else return n; } orig_prev = prev; - while (prev && offset >= extent_map_end(prev_entry)) { + while (prev && offset >= btrfs_extent_map_end(prev_entry)) { prev = rb_next(prev); prev_entry = rb_entry(prev, struct extent_map, rb_node); } @@ -188,14 +188,14 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, static inline u64 extent_map_block_len(const struct extent_map *em) { - if (extent_map_is_compressed(em)) + if (btrfs_extent_map_is_compressed(em)) return em->disk_num_bytes; return em->len; } static inline u64 extent_map_block_end(const struct extent_map *em) { - const u64 block_start = extent_map_block_start(em); + const u64 block_start = btrfs_extent_map_block_start(em); const u64 block_end = block_start + extent_map_block_len(em); if (block_end < block_start) @@ -210,7 +210,7 @@ static bool can_merge_extent_map(const struct extent_map *em) return false; /* Don't merge compressed extents, we need to know their actual size. */ - if (extent_map_is_compressed(em)) + if (btrfs_extent_map_is_compressed(em)) return false; if (em->flags & EXTENT_FLAG_LOGGING) @@ -230,7 +230,7 @@ static bool can_merge_extent_map(const struct extent_map *em) /* Check to see if two extent_map structs are adjacent and safe to merge. */ static bool mergeable_maps(const struct extent_map *prev, const struct extent_map *next) { - if (extent_map_end(prev) != next->start) + if (btrfs_extent_map_end(prev) != next->start) return false; /* @@ -242,7 +242,7 @@ static bool mergeable_maps(const struct extent_map *prev, const struct extent_ma return false; if (next->disk_bytenr < EXTENT_MAP_LAST_BYTE - 1) - return extent_map_block_start(next) == extent_map_block_end(prev); + return btrfs_extent_map_block_start(next) == extent_map_block_end(prev); /* HOLES and INLINE extents. */ return next->disk_bytenr == prev->disk_bytenr; @@ -270,8 +270,8 @@ static void merge_ondisk_extents(const struct extent_map *prev, const struct ext u64 new_offset; /* @prev and @next should not be compressed. */ - ASSERT(!extent_map_is_compressed(prev)); - ASSERT(!extent_map_is_compressed(next)); + ASSERT(!btrfs_extent_map_is_compressed(prev)); + ASSERT(!btrfs_extent_map_is_compressed(next)); /* * There are two different cases where @prev and @next can be merged. @@ -327,9 +327,9 @@ static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map if (em->offset + em->len > em->ram_bytes) dump_extent_map(fs_info, "ram_bytes too small", em); if (em->offset + em->len > em->disk_num_bytes && - !extent_map_is_compressed(em)) + !btrfs_extent_map_is_compressed(em)) dump_extent_map(fs_info, "disk_num_bytes too small", em); - if (!extent_map_is_compressed(em) && + if (!btrfs_extent_map_is_compressed(em) && em->ram_bytes != em->disk_num_bytes) dump_extent_map(fs_info, "ram_bytes mismatch with disk_num_bytes for non-compressed em", @@ -361,8 +361,8 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) if (em->start != 0) { rb = rb_prev(&em->rb_node); - if (rb) - merge = rb_entry(rb, struct extent_map, rb_node); + merge = rb_entry_safe(rb, struct extent_map, rb_node); + if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) { em->start = merge->start; em->len += merge->len; @@ -374,13 +374,13 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) validate_extent_map(fs_info, em); remove_em(inode, merge); - free_extent_map(merge); + btrfs_free_extent_map(merge); } } rb = rb_next(&em->rb_node); - if (rb) - merge = rb_entry(rb, struct extent_map, rb_node); + merge = rb_entry_safe(rb, struct extent_map, rb_node); + if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) { em->len += merge->len; if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) @@ -389,7 +389,7 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) em->generation = max(em->generation, merge->generation); em->flags |= EXTENT_FLAG_MERGED; remove_em(inode, merge); - free_extent_map(merge); + btrfs_free_extent_map(merge); } } @@ -409,7 +409,7 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) * -ENOENT when the extent is not found in the tree * -EUCLEAN if the found extent does not match the expected start */ -int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) +int btrfs_unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_map_tree *tree = &inode->extent_tree; @@ -417,7 +417,7 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) struct extent_map *em; write_lock(&tree->lock); - em = lookup_extent_mapping(tree, start, len); + em = btrfs_lookup_extent_mapping(tree, start, len); if (WARN_ON(!em)) { btrfs_warn(fs_info, @@ -444,17 +444,17 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) out: write_unlock(&tree->lock); - free_extent_map(em); + btrfs_free_extent_map(em); return ret; } -void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em) +void btrfs_clear_em_logging(struct btrfs_inode *inode, struct extent_map *em) { lockdep_assert_held_write(&inode->extent_tree.lock); em->flags &= ~EXTENT_FLAG_LOGGING; - if (extent_map_in_tree(em)) + if (btrfs_extent_map_in_tree(em)) try_merge_map(inode, em); } @@ -502,22 +502,21 @@ static int add_extent_mapping(struct btrfs_inode *inode, setup_extent_mapping(inode, em, modified); - if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(root))) + if (!btrfs_is_testing(fs_info) && btrfs_is_fstree(btrfs_root_id(root))) percpu_counter_inc(&fs_info->evictable_extent_maps); return 0; } -static struct extent_map * -__lookup_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len, int strict) +static struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len, int strict) { struct extent_map *em; struct rb_node *rb_node; struct rb_node *prev_or_next = NULL; u64 end = range_end(start, len); - rb_node = __tree_search(&tree->root, start, &prev_or_next); + rb_node = tree_search(&tree->root, start, &prev_or_next); if (!rb_node) { if (prev_or_next) rb_node = prev_or_next; @@ -527,7 +526,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree, em = rb_entry(rb_node, struct extent_map, rb_node); - if (strict && !(end > em->start && start < extent_map_end(em))) + if (strict && !(end > em->start && start < btrfs_extent_map_end(em))) return NULL; refcount_inc(&em->refs); @@ -546,10 +545,10 @@ __lookup_extent_mapping(struct extent_map_tree *tree, * intersect, so check the object returned carefully to make sure that no * additional lookups are needed. */ -struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len) +struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) { - return __lookup_extent_mapping(tree, start, len, 1); + return lookup_extent_mapping(tree, start, len, 1); } /* @@ -564,10 +563,10 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, * * If one can't be found, any nearby extent may be returned */ -struct extent_map *search_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len) +struct extent_map *btrfs_search_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) { - return __lookup_extent_mapping(tree, start, len, 0); + return lookup_extent_mapping(tree, start, len, 0); } /* @@ -579,7 +578,7 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, * Remove @em from the extent tree of @inode. No reference counts are dropped, * and no checks are done to see if the range is in use. */ -void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em) +void btrfs_remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em) { struct extent_map_tree *tree = &inode->extent_tree; @@ -605,7 +604,7 @@ static void replace_extent_mapping(struct btrfs_inode *inode, validate_extent_map(fs_info, new); WARN_ON(cur->flags & EXTENT_FLAG_PINNED); - ASSERT(extent_map_in_tree(cur)); + ASSERT(btrfs_extent_map_in_tree(cur)); if (!(cur->flags & EXTENT_FLAG_LOGGING)) list_del_init(&cur->list); rb_replace_node(&cur->rb_node, &new->rb_node, &tree->root); @@ -651,7 +650,7 @@ static noinline int merge_extent_mapping(struct btrfs_inode *inode, u64 end; u64 start_diff; - if (map_start < em->start || map_start >= extent_map_end(em)) + if (map_start < em->start || map_start >= btrfs_extent_map_end(em)) return -EINVAL; if (existing->start > map_start) { @@ -662,10 +661,10 @@ static noinline int merge_extent_mapping(struct btrfs_inode *inode, next = next_extent_map(prev); } - start = prev ? extent_map_end(prev) : em->start; + start = prev ? btrfs_extent_map_end(prev) : em->start; start = max_t(u64, start, em->start); - end = next ? next->start : extent_map_end(em); - end = min_t(u64, end, extent_map_end(em)); + end = next ? next->start : btrfs_extent_map_end(em); + end = min_t(u64, end, btrfs_extent_map_end(em)); start_diff = start - em->start; em->start = start; em->len = end - start; @@ -716,7 +715,7 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode, if (ret == -EEXIST) { struct extent_map *existing; - existing = search_extent_mapping(&inode->extent_tree, start, len); + existing = btrfs_search_extent_mapping(&inode->extent_tree, start, len); trace_btrfs_handle_em_exist(fs_info, existing, em, start, len); @@ -725,8 +724,8 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode, * extent causing the -EEXIST. */ if (start >= existing->start && - start < extent_map_end(existing)) { - free_extent_map(em); + start < btrfs_extent_map_end(existing)) { + btrfs_free_extent_map(em); *em_in = existing; ret = 0; } else { @@ -739,14 +738,14 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode, */ ret = merge_extent_mapping(inode, existing, em, start); if (WARN_ON(ret)) { - free_extent_map(em); + btrfs_free_extent_map(em); *em_in = NULL; btrfs_warn(fs_info, "extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu", - existing->start, extent_map_end(existing), + existing->start, btrfs_extent_map_end(existing), orig_start, orig_start + orig_len, start); } - free_extent_map(existing); + btrfs_free_extent_map(existing); } } @@ -772,8 +771,8 @@ static void drop_all_extent_maps_fast(struct btrfs_inode *inode) em = rb_entry(node, struct extent_map, rb_node); em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING); - remove_extent_mapping(inode, em); - free_extent_map(em); + btrfs_remove_extent_mapping(inode, em); + btrfs_free_extent_map(em); if (cond_resched_rwlock_write(&tree->lock)) node = rb_first(&tree->root); @@ -826,15 +825,15 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, * range ends after our range (and they might be the same extent map), * because we need to split those two extent maps at the boundaries. */ - split = alloc_extent_map(); - split2 = alloc_extent_map(); + split = btrfs_alloc_extent_map(); + split2 = btrfs_alloc_extent_map(); write_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); + em = btrfs_lookup_extent_mapping(em_tree, start, len); while (em) { /* extent_map_end() returns exclusive value (last byte + 1). */ - const u64 em_end = extent_map_end(em); + const u64 em_end = btrfs_extent_map_end(em); struct extent_map *next_em = NULL; u64 gen; unsigned long flags; @@ -898,7 +897,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, split->generation = gen; split->flags = flags; replace_extent_mapping(inode, em, split, modified); - free_extent_map(split); + btrfs_free_extent_map(split); split = split2; split2 = NULL; } @@ -925,7 +924,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, split->ram_bytes = split->len; } - if (extent_map_in_tree(em)) { + if (btrfs_extent_map_in_tree(em)) { replace_extent_mapping(inode, em, split, modified); } else { int ret; @@ -936,11 +935,11 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, if (WARN_ON(ret != 0) && modified) btrfs_set_inode_full_sync(inode); } - free_extent_map(split); + btrfs_free_extent_map(split); split = NULL; } remove_em: - if (extent_map_in_tree(em)) { + if (btrfs_extent_map_in_tree(em)) { /* * If the extent map is still in the tree it means that * either of the following is true: @@ -965,25 +964,25 @@ remove_em: ASSERT(!split); btrfs_set_inode_full_sync(inode); } - remove_extent_mapping(inode, em); + btrfs_remove_extent_mapping(inode, em); } /* * Once for the tree reference (we replaced or removed the * extent map from the tree). */ - free_extent_map(em); + btrfs_free_extent_map(em); next: /* Once for us (for our lookup reference). */ - free_extent_map(em); + btrfs_free_extent_map(em); em = next_em; } write_unlock(&em_tree->lock); - free_extent_map(split); - free_extent_map(split2); + btrfs_free_extent_map(split); + btrfs_free_extent_map(split2); } /* @@ -1007,7 +1006,7 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode, struct extent_map_tree *tree = &inode->extent_tree; int ret; - ASSERT(!extent_map_in_tree(new_em)); + ASSERT(!btrfs_extent_map_in_tree(new_em)); /* * The caller has locked an appropriate file range in the inode's io @@ -1033,8 +1032,8 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode, * * This function is used when an ordered_extent needs to be split. */ -int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, - u64 new_logical) +int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, + u64 new_logical) { struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; @@ -1046,25 +1045,25 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, ASSERT(pre != 0); ASSERT(pre < len); - split_pre = alloc_extent_map(); + split_pre = btrfs_alloc_extent_map(); if (!split_pre) return -ENOMEM; - split_mid = alloc_extent_map(); + split_mid = btrfs_alloc_extent_map(); if (!split_mid) { ret = -ENOMEM; goto out_free_pre; } - lock_extent(&inode->io_tree, start, start + len - 1, NULL); + btrfs_lock_extent(&inode->io_tree, start, start + len - 1, NULL); write_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); + em = btrfs_lookup_extent_mapping(em_tree, start, len); if (!em) { ret = -EIO; goto out_unlock; } ASSERT(em->len == len); - ASSERT(!extent_map_is_compressed(em)); + ASSERT(!btrfs_extent_map_is_compressed(em)); ASSERT(em->disk_bytenr < EXTENT_MAP_LAST_BYTE); ASSERT(em->flags & EXTENT_FLAG_PINNED); ASSERT(!(em->flags & EXTENT_FLAG_LOGGING)); @@ -1093,7 +1092,7 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, /* Insert the middle extent_map. */ split_mid->start = em->start + pre; split_mid->len = em->len - pre; - split_mid->disk_bytenr = extent_map_block_start(em) + pre; + split_mid->disk_bytenr = btrfs_extent_map_block_start(em) + pre; split_mid->disk_num_bytes = split_mid->len; split_mid->offset = 0; split_mid->ram_bytes = split_mid->len; @@ -1102,16 +1101,16 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, add_extent_mapping(inode, split_mid, 1); /* Once for us */ - free_extent_map(em); + btrfs_free_extent_map(em); /* Once for the tree */ - free_extent_map(em); + btrfs_free_extent_map(em); out_unlock: write_unlock(&em_tree->lock); - unlock_extent(&inode->io_tree, start, start + len - 1, NULL); - free_extent_map(split_mid); + btrfs_unlock_extent(&inode->io_tree, start, start + len - 1, NULL); + btrfs_free_extent_map(split_mid); out_free_pre: - free_extent_map(split_pre); + btrfs_free_extent_map(split_pre); return ret; } @@ -1128,6 +1127,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c long nr_dropped = 0; struct rb_node *node; + lockdep_assert_held_write(&tree->lock); + /* * Take the mmap lock so that we serialize with the inode logging phase * of fsync because we may need to set the full sync flag on the inode, @@ -1139,28 +1140,12 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c * to find new extents, which may not be there yet because ordered * extents haven't completed yet. * - * We also do a try lock because otherwise we could deadlock. This is - * because the shrinker for this filesystem may be invoked while we are - * in a path that is holding the mmap lock in write mode. For example in - * a reflink operation while COWing an extent buffer, when allocating - * pages for a new extent buffer and under memory pressure, the shrinker - * may be invoked, and therefore we would deadlock by attempting to read - * lock the mmap lock while we are holding already a write lock on it. + * We also do a try lock because we don't want to block for too long and + * we are holding the extent map tree's lock in write mode. */ if (!down_read_trylock(&inode->i_mmap_lock)) return 0; - /* - * We want to be fast so if the lock is busy we don't want to spend time - * waiting for it - either some task is about to do IO for the inode or - * we may have another task shrinking extent maps, here in this code, so - * skip this inode. - */ - if (!write_trylock(&tree->lock)) { - up_read(&inode->i_mmap_lock); - return 0; - } - node = rb_first(&tree->root); while (node) { struct rb_node *next = rb_next(node); @@ -1182,10 +1167,10 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c if (!list_empty(&em->list) && em->generation >= cur_fs_gen) btrfs_set_inode_full_sync(inode); - remove_extent_mapping(inode, em); + btrfs_remove_extent_mapping(inode, em); trace_btrfs_extent_map_shrinker_remove_em(inode, em); /* Drop the reference for the tree. */ - free_extent_map(em); + btrfs_free_extent_map(em); nr_dropped++; next: if (ctx->scanned >= ctx->nr_to_scan) @@ -1201,12 +1186,61 @@ next: break; node = next; } - write_unlock(&tree->lock); up_read(&inode->i_mmap_lock); return nr_dropped; } +static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root, + u64 min_ino) +{ + struct btrfs_inode *inode; + unsigned long from = min_ino; + + xa_lock(&root->inodes); + while (true) { + struct extent_map_tree *tree; + + inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT); + if (!inode) + break; + + tree = &inode->extent_tree; + + /* + * We want to be fast so if the lock is busy we don't want to + * spend time waiting for it (some task is about to do IO for + * the inode). + */ + if (!write_trylock(&tree->lock)) + goto next; + + /* + * Skip inode if it doesn't have loaded extent maps, so we avoid + * getting a reference and doing an iput later. This includes + * cases like files that were opened for things like stat(2), or + * files with all extent maps previously released through the + * release folio callback (btrfs_release_folio()) or released in + * a previous run, or directories which never have extent maps. + */ + if (RB_EMPTY_ROOT(&tree->root)) { + write_unlock(&tree->lock); + goto next; + } + + if (igrab(&inode->vfs_inode)) + break; + + write_unlock(&tree->lock); +next: + from = btrfs_ino(inode) + 1; + cond_resched_lock(&root->inodes.xa_lock); + } + xa_unlock(&root->inodes); + + return inode; +} + static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -1214,21 +1248,21 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx long nr_dropped = 0; u64 min_ino = fs_info->em_shrinker_last_ino + 1; - inode = btrfs_find_first_inode(root, min_ino); + inode = find_first_inode_to_shrink(root, min_ino); while (inode) { nr_dropped += btrfs_scan_inode(inode, ctx); + write_unlock(&inode->extent_tree.lock); min_ino = btrfs_ino(inode) + 1; fs_info->em_shrinker_last_ino = btrfs_ino(inode); - btrfs_add_delayed_iput(inode); + iput(&inode->vfs_inode); - if (ctx->scanned >= ctx->nr_to_scan || - btrfs_fs_closing(inode->root->fs_info)) + if (ctx->scanned >= ctx->nr_to_scan || btrfs_fs_closing(fs_info)) break; cond_resched(); - inode = btrfs_find_first_inode(root, min_ino); + inode = find_first_inode_to_shrink(root, min_ino); } if (inode) { @@ -1303,7 +1337,7 @@ static void btrfs_extent_map_shrinker_worker(struct work_struct *work) if (!root) continue; - if (is_fstree(btrfs_root_id(root))) + if (btrfs_is_fstree(btrfs_root_id(root))) nr_dropped += btrfs_scan_root(root, &ctx); btrfs_put_root(root); diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index cd123b266b64..d4b81ee4d97b 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -108,8 +108,8 @@ struct extent_map_tree { struct btrfs_inode; -static inline void extent_map_set_compression(struct extent_map *em, - enum btrfs_compression_type type) +static inline void btrfs_extent_map_set_compression(struct extent_map *em, + enum btrfs_compression_type type) { if (type == BTRFS_COMPRESS_ZLIB) em->flags |= EXTENT_FLAG_COMPRESS_ZLIB; @@ -119,7 +119,8 @@ static inline void extent_map_set_compression(struct extent_map *em, em->flags |= EXTENT_FLAG_COMPRESS_ZSTD; } -static inline enum btrfs_compression_type extent_map_compression(const struct extent_map *em) +static inline enum btrfs_compression_type btrfs_extent_map_compression( + const struct extent_map *em) { if (em->flags & EXTENT_FLAG_COMPRESS_ZLIB) return BTRFS_COMPRESS_ZLIB; @@ -137,50 +138,50 @@ static inline enum btrfs_compression_type extent_map_compression(const struct ex * More efficient way to determine if extent is compressed, instead of using * 'extent_map_compression() != BTRFS_COMPRESS_NONE'. */ -static inline bool extent_map_is_compressed(const struct extent_map *em) +static inline bool btrfs_extent_map_is_compressed(const struct extent_map *em) { return (em->flags & (EXTENT_FLAG_COMPRESS_ZLIB | EXTENT_FLAG_COMPRESS_LZO | EXTENT_FLAG_COMPRESS_ZSTD)) != 0; } -static inline int extent_map_in_tree(const struct extent_map *em) +static inline int btrfs_extent_map_in_tree(const struct extent_map *em) { return !RB_EMPTY_NODE(&em->rb_node); } -static inline u64 extent_map_block_start(const struct extent_map *em) +static inline u64 btrfs_extent_map_block_start(const struct extent_map *em) { if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) { - if (extent_map_is_compressed(em)) + if (btrfs_extent_map_is_compressed(em)) return em->disk_bytenr; return em->disk_bytenr + em->offset; } return em->disk_bytenr; } -static inline u64 extent_map_end(const struct extent_map *em) +static inline u64 btrfs_extent_map_end(const struct extent_map *em) { if (em->start + em->len < em->start) return (u64)-1; return em->start + em->len; } -void extent_map_tree_init(struct extent_map_tree *tree); -struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len); -void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em); -int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, - u64 new_logical); - -struct extent_map *alloc_extent_map(void); -void free_extent_map(struct extent_map *em); -int __init extent_map_init(void); -void __cold extent_map_exit(void); -int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen); -void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em); -struct extent_map *search_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len); +void btrfs_extent_map_tree_init(struct extent_map_tree *tree); +struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len); +void btrfs_remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em); +int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, + u64 new_logical); + +struct extent_map *btrfs_alloc_extent_map(void); +void btrfs_free_extent_map(struct extent_map *em); +int __init btrfs_extent_map_init(void); +void __cold btrfs_extent_map_exit(void); +int btrfs_unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen); +void btrfs_clear_em_logging(struct btrfs_inode *inode, struct extent_map *em); +struct extent_map *btrfs_search_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len); int btrfs_add_extent_mapping(struct btrfs_inode *inode, struct extent_map **em_in, u64 start, u64 len); void btrfs_drop_extent_map_range(struct btrfs_inode *inode, diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c index b80c07ad8c5e..7935586a9dbd 100644 --- a/fs/btrfs/fiemap.c +++ b/fs/btrfs/fiemap.c @@ -320,7 +320,7 @@ static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *p * the cost of allocating a new one. */ ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags)); - atomic_inc(&clone->refs); + refcount_inc(&clone->refs); ret = btrfs_next_leaf(inode->root, path); if (ret != 0) @@ -634,7 +634,7 @@ static int extent_fiemap(struct btrfs_inode *inode, const u64 ino = btrfs_ino(inode); struct extent_state *cached_state = NULL; struct extent_state *delalloc_cached_state = NULL; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct fiemap_cache cache = { 0 }; struct btrfs_backref_share_check_ctx *backref_ctx; u64 last_extent_end = 0; @@ -661,7 +661,7 @@ restart: range_end = round_up(start + len, sectorsize); prev_extent_end = range_start; - lock_extent(&inode->io_tree, range_start, range_end, &cached_state); + btrfs_lock_extent(&inode->io_tree, range_start, range_end, &cached_state); ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); if (ret < 0) @@ -841,7 +841,7 @@ check_eof_delalloc: } out_unlock: - unlock_extent(&inode->io_tree, range_start, range_end, &cached_state); + btrfs_unlock_extent(&inode->io_tree, range_start, range_end, &cached_state); if (ret == BTRFS_FIEMAP_FLUSH_CACHE) { btrfs_release_path(path); @@ -871,10 +871,9 @@ out_unlock: ret = emit_last_fiemap_cache(fieinfo, &cache); out: - free_extent_state(delalloc_cached_state); + btrfs_free_extent_state(delalloc_cached_state); kfree(cache.entries); btrfs_free_backref_share_ctx(backref_ctx); - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 886749b39672..c09fbc257634 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -46,7 +46,7 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size) { u64 start, end, i_size; - int ret; + bool found; spin_lock(&inode->lock); i_size = new_i_size ?: i_size_read(&inode->vfs_inode); @@ -55,9 +55,9 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz goto out_unlock; } - ret = find_contiguous_extent_bit(inode->file_extent_tree, 0, &start, - &end, EXTENT_DIRTY); - if (!ret && start == 0) + found = btrfs_find_contiguous_extent_bit(inode->file_extent_tree, 0, &start, + &end, EXTENT_DIRTY); + if (found && start == 0) i_size = min(i_size, end + 1); else i_size = 0; @@ -91,8 +91,8 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize)); - return set_extent_bit(inode->file_extent_tree, start, start + len - 1, - EXTENT_DIRTY, NULL); + return btrfs_set_extent_bit(inode->file_extent_tree, start, start + len - 1, + EXTENT_DIRTY, NULL); } /* @@ -121,8 +121,8 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize) || len == (u64)-1); - return clear_extent_bit(inode->file_extent_tree, start, - start + len - 1, EXTENT_DIRTY, NULL); + return btrfs_clear_extent_bit(inode->file_extent_tree, start, + start + len - 1, EXTENT_DIRTY, NULL); } static size_t bytes_to_csum_size(const struct btrfs_fs_info *fs_info, u32 bytes) @@ -163,20 +163,21 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, int ret = 0; struct btrfs_file_extent_item *item; struct btrfs_key file_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; path = btrfs_alloc_path(); if (!path) return -ENOMEM; + file_key.objectid = objectid; - file_key.offset = pos; file_key.type = BTRFS_EXTENT_DATA_KEY; + file_key.offset = pos; ret = btrfs_insert_empty_item(trans, root, path, &file_key, sizeof(*item)); if (ret < 0) - goto out; + return ret; leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -191,9 +192,6 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_encryption(leaf, item, 0); btrfs_set_file_extent_other_encoding(leaf, item, 0); - btrfs_mark_buffer_dirty(trans, leaf); -out: - btrfs_free_path(path); return ret; } @@ -214,8 +212,8 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans, int csums_in_item; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - file_key.offset = bytenr; file_key.type = BTRFS_EXTENT_CSUM_KEY; + file_key.offset = bytenr; ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); if (ret < 0) goto fail; @@ -261,8 +259,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, int cow = mod != 0; file_key.objectid = objectid; - file_key.offset = offset; file_key.type = BTRFS_EXTENT_DATA_KEY; + file_key.offset = offset; return btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); } @@ -338,23 +336,23 @@ out: * * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. */ -blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) +int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio = &bbio->bio; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); const u32 sectorsize = fs_info->sectorsize; const u32 csum_size = fs_info->csum_size; u32 orig_len = bio->bi_iter.bi_size; u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; - blk_status_t ret = BLK_STS_OK; + int ret = 0; u32 bio_offset = 0; if ((inode->flags & BTRFS_INODE_NODATASUM) || test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state)) - return BLK_STS_OK; + return 0; /* * This function is only called for read bio. @@ -371,14 +369,12 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) ASSERT(bio_op(bio) == REQ_OP_READ); path = btrfs_alloc_path(); if (!path) - return BLK_STS_RESOURCE; + return -ENOMEM; if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); - if (!bbio->csum) { - btrfs_free_path(path); - return BLK_STS_RESOURCE; - } + if (!bbio->csum) + return -ENOMEM; } else { bbio->csum = bbio->csum_inline; } @@ -410,7 +406,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) count = search_csum_tree(fs_info, path, cur_disk_bytenr, orig_len - bio_offset, csum_dst); if (count < 0) { - ret = errno_to_blk_status(count); + ret = count; if (bbio->csum != bbio->csum_inline) kfree(bbio->csum); bbio->csum = NULL; @@ -431,12 +427,12 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) memset(csum_dst, 0, csum_size); count = 1; - if (btrfs_root_id(inode->root) == BTRFS_DATA_RELOC_TREE_OBJECTID) { + if (btrfs_is_data_reloc_root(inode->root)) { u64 file_offset = bbio->file_offset + bio_offset; - set_extent_bit(&inode->io_tree, file_offset, - file_offset + sectorsize - 1, - EXTENT_NODATASUM, NULL); + btrfs_set_extent_bit(&inode->io_tree, file_offset, + file_offset + sectorsize - 1, + EXTENT_NODATASUM, NULL); } else { btrfs_warn_rl(fs_info, "csum hole found for disk bytenr range [%llu, %llu)", @@ -446,7 +442,6 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) bio_offset += count * sectorsize; } - btrfs_free_path(path); return ret; } @@ -486,8 +481,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, path->nowait = nowait; key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key.offset = start; key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = start; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -740,7 +735,7 @@ fail: /* * Calculate checksums of the data contained inside a bio. */ -blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) +int btrfs_csum_one_bio(struct btrfs_bio *bbio) { struct btrfs_ordered_extent *ordered = bbio->ordered; struct btrfs_inode *inode = bbio->inode; @@ -762,7 +757,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) memalloc_nofs_restore(nofs_flag); if (!sums) - return BLK_STS_RESOURCE; + return -ENOMEM; sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); @@ -799,11 +794,11 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) * record the updated logical address on Zone Append completion. * Allocate just the structure with an empty sums array here for that case. */ -blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio) +int btrfs_alloc_dummy_sum(struct btrfs_bio *bbio) { bbio->sums = kmalloc(sizeof(*bbio->sums), GFP_NOFS); if (!bbio->sums) - return BLK_STS_RESOURCE; + return -ENOMEM; bbio->sums->len = bbio->bio.bi_iter.bi_size; bbio->sums->logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; btrfs_add_ordered_sum(bbio->ordered, bbio->sums); @@ -876,7 +871,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; u64 end_byte = bytenr + len; u64 csum_end; @@ -894,8 +889,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, while (1) { key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key.offset = end_byte - 1; key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = end_byte - 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { @@ -1012,7 +1007,6 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, } btrfs_release_path(path); } - btrfs_free_path(path); return ret; } @@ -1054,7 +1048,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key file_key; struct btrfs_key found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_csum_item *item; struct btrfs_csum_item *item_end; struct extent_buffer *leaf = NULL; @@ -1076,8 +1070,8 @@ again: found_next = 0; bytenr = sums->logical + total_bytes; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - file_key.offset = bytenr; file_key.type = BTRFS_EXTENT_CSUM_KEY; + file_key.offset = bytenr; item = btrfs_lookup_csum(trans, root, path, bytenr, 1); if (!IS_ERR(item)) { @@ -1259,14 +1253,12 @@ found: ins_size /= csum_size; total_bytes += ins_size * fs_info->sectorsize; - btrfs_mark_buffer_dirty(trans, path->nodes[0]); if (total_bytes < sums->len) { btrfs_release_path(path); cond_resched(); goto again; } out: - btrfs_free_path(path); return ret; } @@ -1304,7 +1296,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, em->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); em->offset = btrfs_file_extent_offset(leaf, fi); if (compress_type != BTRFS_COMPRESS_NONE) { - extent_map_set_compression(em, compress_type); + btrfs_extent_map_set_compression(em, compress_type); } else { /* * Older kernels can create regular non-hole data @@ -1324,7 +1316,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, em->start = 0; em->len = fs_info->sectorsize; em->offset = 0; - extent_map_set_compression(em, compress_type); + btrfs_extent_map_set_compression(em, compress_type); } else { btrfs_err(fs_info, "unknown file extent item type %d, inode %llu, offset %llu, " diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 0e13661a71f3..63216c43676d 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -3,8 +3,10 @@ #ifndef BTRFS_FILE_ITEM_H #define BTRFS_FILE_ITEM_H +#include <linux/blk_types.h> #include <linux/list.h> #include <uapi/linux/btrfs_tree.h> +#include "ctree.h" #include "accessors.h" struct extent_map; @@ -51,7 +53,7 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); -blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio); +int btrfs_lookup_bio_sums(struct btrfs_bio *bbio); int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, u64 num_bytes); @@ -62,8 +64,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); -blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio); -blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio); +int btrfs_csum_one_bio(struct btrfs_bio *bbio); +int btrfs_alloc_dummy_sum(struct btrfs_bio *bbio); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit, bool nowait); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 588c353d2969..204674934795 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -36,52 +36,7 @@ #include "ioctl.h" #include "file.h" #include "super.h" - -/* - * Helper to fault in page and copy. This should go away and be replaced with - * calls into generic code. - */ -static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, - struct folio *folio, struct iov_iter *i) -{ - size_t copied = 0; - size_t total_copied = 0; - int offset = offset_in_page(pos); - - while (write_bytes > 0) { - size_t count = min_t(size_t, PAGE_SIZE - offset, write_bytes); - /* - * Copy data from userspace to the current page - */ - copied = copy_folio_from_iter_atomic(folio, offset, count, i); - - /* Flush processor's dcache for this page */ - flush_dcache_folio(folio); - - /* - * if we get a partial write, we can end up with - * partially up to date page. These add - * a lot of complexity, so make sure they don't - * happen by forcing this copy to be retried. - * - * The rest of the btrfs_file_write code will fall - * back to page at a time copies after we return 0. - */ - if (unlikely(copied < count)) { - if (!folio_test_uptodate(folio)) { - iov_iter_revert(i, copied); - copied = 0; - } - if (!copied) - break; - } - - write_bytes -= copied; - total_copied += copied; - offset += copied; - } - return total_copied; -} +#include "print-tree.h" /* * Unlock folio after btrfs_file_write() is done with it. @@ -106,7 +61,7 @@ static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio, } /* - * After btrfs_copy_from_user(), update the following things for delalloc: + * After copy_folio_from_iter_atomic(), update the following things for delalloc: * - Mark newly dirtied folio as DELALLOC in the io tree. * Used to advise which range is to be written back. * - Mark modified folio as Uptodate/Dirty and not needing COW fixup @@ -134,8 +89,7 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos num_bytes = round_up(write_bytes + pos - start_pos, fs_info->sectorsize); ASSERT(num_bytes <= U32_MAX); - ASSERT(folio_pos(folio) <= pos && - folio_pos(folio) + folio_size(folio) >= pos + write_bytes); + ASSERT(folio_pos(folio) <= pos && folio_end(folio) >= pos + write_bytes); end_of_last_block = start_pos + num_bytes - 1; @@ -143,9 +97,9 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos * The pages may have already been dirty, clear out old accounting so * we can set things up properly */ - clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - cached); + btrfs_clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + cached); ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, extra_bits, cached); @@ -224,7 +178,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (args->drop_cache) btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false); - if (args->start >= inode->disk_i_size && !args->replace_extent) + if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent) modify_tree = 0; update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID); @@ -245,7 +199,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { - BUG_ON(del_nr > 0); + if (WARN_ON(del_nr > 0)) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } ret = btrfs_next_leaf(root, path); if (ret < 0) break; @@ -321,7 +279,11 @@ next_slot: * | -------- extent -------- | */ if (args->start > key.offset && args->end < extent_end) { - BUG_ON(del_nr > 0); + if (WARN_ON(del_nr > 0)) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; break; @@ -351,7 +313,6 @@ next_slot: btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - args->start); - btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) { struct btrfs_ref ref = { @@ -397,7 +358,6 @@ next_slot: btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - args->end); - btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) args->bytes_found += args->end - key.offset; break; @@ -409,7 +369,11 @@ next_slot: * | -------- extent -------- | */ if (args->start > key.offset && args->end >= extent_end) { - BUG_ON(del_nr > 0); + if (WARN_ON(del_nr > 0)) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; break; @@ -417,7 +381,6 @@ next_slot: btrfs_set_file_extent_num_bytes(leaf, fi, args->start - key.offset); - btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) args->bytes_found += extent_end - args->start; if (args->end == extent_end) @@ -437,7 +400,11 @@ delete_extent_item: del_slot = path->slots[0]; del_nr = 1; } else { - BUG_ON(del_slot + del_nr != path->slots[0]); + if (WARN_ON(del_slot + del_nr != path->slots[0])) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } del_nr++; } @@ -540,20 +507,19 @@ out: return ret; } -static int extent_mergeable(struct extent_buffer *leaf, int slot, - u64 objectid, u64 bytenr, u64 orig_offset, - u64 *start, u64 *end) +static bool extent_mergeable(struct extent_buffer *leaf, int slot, u64 objectid, + u64 bytenr, u64 orig_offset, u64 *start, u64 *end) { struct btrfs_file_extent_item *fi; struct btrfs_key key; u64 extent_end; if (slot < 0 || slot >= btrfs_header_nritems(leaf)) - return 0; + return false; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY) - return 0; + return false; fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG || @@ -562,15 +528,15 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot, btrfs_file_extent_compression(leaf, fi) || btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)) - return 0; + return false; extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); if ((*start && *start != key.offset) || (*end && *end != extent_end)) - return 0; + return false; *start = key.offset; *end = extent_end; - return 1; + return true; } /* @@ -585,7 +551,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; struct extent_buffer *leaf; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_file_extent_item *fi; struct btrfs_ref ref = { 0 }; struct btrfs_key key; @@ -668,7 +634,6 @@ again: trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, end - other_start); - btrfs_mark_buffer_dirty(trans, leaf); goto out; } } @@ -697,7 +662,6 @@ again: other_end - start); btrfs_set_file_extent_offset(leaf, fi, start - orig_offset); - btrfs_mark_buffer_dirty(trans, leaf); goto out; } } @@ -731,7 +695,6 @@ again: btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); - btrfs_mark_buffer_dirty(trans, leaf); ref.action = BTRFS_ADD_DELAYED_REF; ref.bytenr = bytenr; @@ -810,7 +773,6 @@ again: btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); } else { fi = btrfs_item_ptr(leaf, del_slot - 1, struct btrfs_file_extent_item); @@ -819,7 +781,6 @@ again: btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); - btrfs_mark_buffer_dirty(trans, leaf); ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret < 0) { @@ -828,7 +789,6 @@ again: } } out: - btrfs_free_path(path); return ret; } @@ -837,18 +797,18 @@ out: * On success return a locked folio and 0 */ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos, - u64 len, bool force_uptodate) + u64 len) { u64 clamp_start = max_t(u64, pos, folio_pos(folio)); - u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio)); + u64 clamp_end = min_t(u64, pos + len, folio_end(folio)); + const u32 blocksize = inode_to_fs_info(inode)->sectorsize; int ret = 0; if (folio_test_uptodate(folio)) return 0; - if (!force_uptodate && - IS_ALIGNED(clamp_start, PAGE_SIZE) && - IS_ALIGNED(clamp_end, PAGE_SIZE)) + if (IS_ALIGNED(clamp_start, blocksize) && + IS_ALIGNED(clamp_end, blocksize)) return 0; ret = btrfs_read_folio(NULL, folio); @@ -894,32 +854,27 @@ static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait) */ static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret, loff_t pos, size_t write_bytes, - bool force_uptodate, bool nowait) + bool nowait) { - unsigned long index = pos >> PAGE_SHIFT; + const pgoff_t index = pos >> PAGE_SHIFT; gfp_t mask = get_prepare_gfp_flags(inode, nowait); - fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN); + fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN) | + fgf_set_order(write_bytes); struct folio *folio; int ret = 0; again: folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask); - if (IS_ERR(folio)) { - if (nowait) - ret = -EAGAIN; - else - ret = PTR_ERR(folio); - return ret; - } - /* Only support page sized folio yet. */ - ASSERT(folio_order(folio) == 0); + if (IS_ERR(folio)) + return PTR_ERR(folio); + ret = set_folio_extent_mapped(folio); if (ret < 0) { folio_unlock(folio); folio_put(folio); return ret; } - ret = prepare_uptodate_folio(inode, folio, pos, write_bytes, force_uptodate); + ret = prepare_uptodate_folio(inode, folio, pos, write_bytes); if (ret) { /* The folio is already unlocked. */ folio_put(folio); @@ -960,14 +915,15 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, struct btrfs_ordered_extent *ordered; if (nowait) { - if (!try_lock_extent(&inode->io_tree, start_pos, last_pos, - cached_state)) { + if (!btrfs_try_lock_extent(&inode->io_tree, start_pos, + last_pos, cached_state)) { folio_unlock(folio); folio_put(folio); return -EAGAIN; } } else { - lock_extent(&inode->io_tree, start_pos, last_pos, cached_state); + btrfs_lock_extent(&inode->io_tree, start_pos, last_pos, + cached_state); } ordered = btrfs_lookup_ordered_range(inode, start_pos, @@ -975,8 +931,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, if (ordered && ordered->file_offset + ordered->num_bytes > start_pos && ordered->file_offset <= last_pos) { - unlock_extent(&inode->io_tree, start_pos, last_pos, - cached_state); + btrfs_unlock_extent(&inode->io_tree, start_pos, last_pos, + cached_state); folio_unlock(folio); folio_put(folio); btrfs_start_ordered_extent(ordered); @@ -1006,6 +962,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, * @pos: File offset. * @write_bytes: The length to write, will be updated to the nocow writeable * range. + * @nowait: Indicate if we can block or not (non-blocking IO context). * * This function will flush ordered extents in the range to ensure proper * nocow checks. @@ -1014,7 +971,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, * > 0 If we can nocow, and updates @write_bytes. * 0 If we can't do a nocow write. * -EAGAIN If we can't do a nocow write because snapshoting of the inode's - * root is in progress. + * root is in progress or because we are in a non-blocking IO + * context and need to block (@nowait is true). * < 0 If an error happened. * * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0. @@ -1026,8 +984,8 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, struct btrfs_root *root = inode->root; struct extent_state *cached_state = NULL; u64 lockstart, lockend; - u64 num_bytes; - int ret; + u64 cur_offset; + int ret = 0; if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) return 0; @@ -1038,7 +996,6 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, lockstart = round_down(pos, fs_info->sectorsize); lockend = round_up(pos + *write_bytes, fs_info->sectorsize) - 1; - num_bytes = lockend - lockstart + 1; if (nowait) { if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend, @@ -1050,14 +1007,35 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, &cached_state); } - ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, - NULL, nowait, false); - if (ret <= 0) - btrfs_drew_write_unlock(&root->snapshot_lock); - else - *write_bytes = min_t(size_t, *write_bytes , - num_bytes - pos + lockstart); - unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + + cur_offset = lockstart; + while (cur_offset < lockend) { + u64 num_bytes = lockend - cur_offset + 1; + + ret = can_nocow_extent(inode, cur_offset, &num_bytes, NULL, nowait); + if (ret <= 0) { + /* + * If cur_offset == lockstart it means we haven't found + * any extent against which we can NOCOW, so unlock the + * snapshot lock. + */ + if (cur_offset == lockstart) + btrfs_drew_write_unlock(&root->snapshot_lock); + break; + } + cur_offset += num_bytes; + } + + btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + + /* + * cur_offset > lockstart means there's at least a partial range we can + * NOCOW, and that range can cover one or more extents. + */ + if (cur_offset > lockstart) { + *write_bytes = min_t(size_t, *write_bytes, cur_offset - pos); + return 1; + } return ret; } @@ -1075,7 +1053,6 @@ int btrfs_write_check(struct kiocb *iocb, size_t count) loff_t pos = iocb->ki_pos; int ret; loff_t oldsize; - loff_t start_pos; /* * Quickly bail out on NOWAIT writes if we don't have the nodatacow or @@ -1102,9 +1079,8 @@ int btrfs_write_check(struct kiocb *iocb, size_t count) inode_inc_iversion(inode); } - start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); - if (start_pos > oldsize) { + if (pos > oldsize) { /* Expand hole size to cover write data, preventing empty gap */ loff_t end_pos = round_up(pos + count, fs_info->sectorsize); @@ -1116,218 +1092,306 @@ int btrfs_write_check(struct kiocb *iocb, size_t count) return 0; } -ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) +static void release_space(struct btrfs_inode *inode, struct extent_changeset *data_reserved, + u64 start, u64 len, bool only_release_metadata) { - struct file *file = iocb->ki_filp; - loff_t pos; - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct extent_changeset *data_reserved = NULL; - u64 release_bytes = 0; - u64 lockstart; - u64 lockend; - size_t num_written = 0; - ssize_t ret; - loff_t old_isize = i_size_read(inode); - unsigned int ilock_flags = 0; - const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); - unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0); - bool only_release_metadata = false; + if (len == 0) + return; - if (nowait) - ilock_flags |= BTRFS_ILOCK_TRY; + if (only_release_metadata) { + btrfs_check_nocow_unlock(inode); + btrfs_delalloc_release_metadata(inode, len, true); + } else { + const struct btrfs_fs_info *fs_info = inode->root->fs_info; - ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); - if (ret < 0) - return ret; + btrfs_delalloc_release_space(inode, data_reserved, + round_down(start, fs_info->sectorsize), + len, true); + } +} - ret = generic_write_checks(iocb, i); - if (ret <= 0) - goto out; +/* + * Reserve data and metadata space for this buffered write range. + * + * Return >0 for the number of bytes reserved, which is always block aligned. + * Return <0 for error. + */ +static ssize_t reserve_space(struct btrfs_inode *inode, + struct extent_changeset **data_reserved, + u64 start, size_t *len, bool nowait, + bool *only_release_metadata) +{ + const struct btrfs_fs_info *fs_info = inode->root->fs_info; + const unsigned int block_offset = (start & (fs_info->sectorsize - 1)); + size_t reserve_bytes; + int ret; - ret = btrfs_write_check(iocb, ret); - if (ret < 0) - goto out; + ret = btrfs_check_data_free_space(inode, data_reserved, start, *len, nowait); + if (ret < 0) { + int can_nocow; - pos = iocb->ki_pos; - while (iov_iter_count(i) > 0) { - struct extent_state *cached_state = NULL; - size_t offset = offset_in_page(pos); - size_t sector_offset; - size_t write_bytes = min(iov_iter_count(i), PAGE_SIZE - offset); - size_t reserve_bytes; - size_t copied; - size_t dirty_sectors; - size_t num_sectors; - struct folio *folio = NULL; - int extents_locked; - bool force_page_uptodate = false; + if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) + return -EAGAIN; /* - * Fault pages before locking them in prepare_one_folio() - * to avoid recursive lock + * If we don't have to COW at the offset, reserve metadata only. + * write_bytes may get smaller than requested here. */ - if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) { - ret = -EFAULT; - break; - } + can_nocow = btrfs_check_nocow_lock(inode, start, len, nowait); + if (can_nocow < 0) + ret = can_nocow; + if (can_nocow > 0) + ret = 0; + if (ret) + return ret; + *only_release_metadata = true; + } - only_release_metadata = false; - sector_offset = pos & (fs_info->sectorsize - 1); + reserve_bytes = round_up(*len + block_offset, fs_info->sectorsize); + WARN_ON(reserve_bytes == 0); + ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes, + reserve_bytes, nowait); + if (ret) { + if (!*only_release_metadata) + btrfs_free_reserved_data_space(inode, *data_reserved, + start, *len); + else + btrfs_check_nocow_unlock(inode); - extent_changeset_release(data_reserved); - ret = btrfs_check_data_free_space(BTRFS_I(inode), - &data_reserved, pos, - write_bytes, nowait); - if (ret < 0) { - int can_nocow; + if (nowait && ret == -ENOSPC) + ret = -EAGAIN; + return ret; + } + return reserve_bytes; +} - if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) { - ret = -EAGAIN; - break; - } +/* Shrink the reserved data and metadata space from @reserved_len to @new_len. */ +static void shrink_reserved_space(struct btrfs_inode *inode, + struct extent_changeset *data_reserved, + u64 reserved_start, u64 reserved_len, + u64 new_len, bool only_release_metadata) +{ + const u64 diff = reserved_len - new_len; - /* - * If we don't have to COW at the offset, reserve - * metadata only. write_bytes may get smaller than - * requested here. - */ - can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos, - &write_bytes, nowait); - if (can_nocow < 0) - ret = can_nocow; - if (can_nocow > 0) - ret = 0; - if (ret) - break; - only_release_metadata = true; - } + ASSERT(new_len <= reserved_len); + btrfs_delalloc_shrink_extents(inode, reserved_len, new_len); + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, diff, true); + else + btrfs_delalloc_release_space(inode, data_reserved, + reserved_start + new_len, diff, true); +} - reserve_bytes = round_up(write_bytes + sector_offset, - fs_info->sectorsize); - WARN_ON(reserve_bytes == 0); - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), - reserve_bytes, - reserve_bytes, nowait); - if (ret) { - if (!only_release_metadata) - btrfs_free_reserved_data_space(BTRFS_I(inode), - data_reserved, pos, - write_bytes); - else - btrfs_check_nocow_unlock(BTRFS_I(inode)); +/* Calculate the maximum amount of bytes we can write into one folio. */ +static size_t calc_write_bytes(const struct btrfs_inode *inode, + const struct iov_iter *iter, u64 start) +{ + const size_t max_folio_size = mapping_max_folio_size(inode->vfs_inode.i_mapping); - if (nowait && ret == -ENOSPC) - ret = -EAGAIN; - break; - } + return min(max_folio_size - (start & (max_folio_size - 1)), + iov_iter_count(iter)); +} + +/* + * Do the heavy-lifting work to copy one range into one folio of the page cache. + * + * Return > 0 in case we copied all bytes or just some of them. + * Return 0 if no bytes were copied, in which case the caller should retry. + * Return <0 on error. + */ +static int copy_one_range(struct btrfs_inode *inode, struct iov_iter *iter, + struct extent_changeset **data_reserved, u64 start, + bool nowait) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_state *cached_state = NULL; + size_t write_bytes = calc_write_bytes(inode, iter, start); + size_t copied; + const u64 reserved_start = round_down(start, fs_info->sectorsize); + u64 reserved_len; + struct folio *folio = NULL; + int extents_locked; + u64 lockstart; + u64 lockend; + bool only_release_metadata = false; + const unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0); + int ret; + + /* + * Fault all pages before locking them in prepare_one_folio() to avoid + * recursive lock. + */ + if (unlikely(fault_in_iov_iter_readable(iter, write_bytes))) + return -EFAULT; + extent_changeset_release(*data_reserved); + ret = reserve_space(inode, data_reserved, start, &write_bytes, nowait, + &only_release_metadata); + if (ret < 0) + return ret; + reserved_len = ret; + /* Write range must be inside the reserved range. */ + ASSERT(reserved_start <= start); + ASSERT(start + write_bytes <= reserved_start + reserved_len); - release_bytes = reserve_bytes; again: - ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags); - if (ret) { - btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); - break; - } + ret = balance_dirty_pages_ratelimited_flags(inode->vfs_inode.i_mapping, + bdp_flags); + if (ret) { + btrfs_delalloc_release_extents(inode, reserved_len); + release_space(inode, *data_reserved, reserved_start, reserved_len, + only_release_metadata); + return ret; + } - ret = prepare_one_folio(inode, &folio, pos, write_bytes, - force_page_uptodate, false); - if (ret) { - btrfs_delalloc_release_extents(BTRFS_I(inode), - reserve_bytes); - break; - } + ret = prepare_one_folio(&inode->vfs_inode, &folio, start, write_bytes, false); + if (ret) { + btrfs_delalloc_release_extents(inode, reserved_len); + release_space(inode, *data_reserved, reserved_start, reserved_len, + only_release_metadata); + return ret; + } - extents_locked = lock_and_cleanup_extent_if_need(BTRFS_I(inode), - folio, pos, write_bytes, &lockstart, - &lockend, nowait, &cached_state); - if (extents_locked < 0) { - if (!nowait && extents_locked == -EAGAIN) - goto again; + /* + * The reserved range goes beyond the current folio, shrink the reserved + * space to the folio boundary. + */ + if (reserved_start + reserved_len > folio_end(folio)) { + const u64 last_block = folio_end(folio); + + shrink_reserved_space(inode, *data_reserved, reserved_start, + reserved_len, last_block - reserved_start, + only_release_metadata); + write_bytes = last_block - start; + reserved_len = last_block - reserved_start; + } + + extents_locked = lock_and_cleanup_extent_if_need(inode, folio, start, + write_bytes, &lockstart, + &lockend, nowait, + &cached_state); + if (extents_locked < 0) { + if (!nowait && extents_locked == -EAGAIN) + goto again; - btrfs_delalloc_release_extents(BTRFS_I(inode), - reserve_bytes); - ret = extents_locked; - break; - } + btrfs_delalloc_release_extents(inode, reserved_len); + release_space(inode, *data_reserved, reserved_start, reserved_len, + only_release_metadata); + ret = extents_locked; + return ret; + } - copied = btrfs_copy_from_user(pos, write_bytes, folio, i); + copied = copy_folio_from_iter_atomic(folio, offset_in_folio(folio, start), + write_bytes, iter); + flush_dcache_folio(folio); - num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes); - dirty_sectors = round_up(copied + sector_offset, - fs_info->sectorsize); - dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors); + if (unlikely(copied < write_bytes)) { + u64 last_block; - if (copied == 0) { - force_page_uptodate = true; - dirty_sectors = 0; - } else { - force_page_uptodate = false; + /* + * The original write range doesn't need an uptodate folio as + * the range is block aligned. But now a short copy happened. + * We cannot handle it without an uptodate folio. + * + * So just revert the range and we will retry. + */ + if (!folio_test_uptodate(folio)) { + iov_iter_revert(iter, copied); + copied = 0; } - if (num_sectors > dirty_sectors) { - /* release everything except the sectors we dirtied */ - release_bytes -= dirty_sectors << fs_info->sectorsize_bits; - if (only_release_metadata) { - btrfs_delalloc_release_metadata(BTRFS_I(inode), - release_bytes, true); - } else { - u64 release_start = round_up(pos + copied, - fs_info->sectorsize); - btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, release_start, - release_bytes, true); - } + /* No copied bytes, unlock, release reserved space and exit. */ + if (copied == 0) { + if (extents_locked) + btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, + &cached_state); + else + btrfs_free_extent_state(cached_state); + btrfs_delalloc_release_extents(inode, reserved_len); + release_space(inode, *data_reserved, reserved_start, reserved_len, + only_release_metadata); + btrfs_drop_folio(fs_info, folio, start, copied); + return 0; } - release_bytes = round_up(copied + sector_offset, - fs_info->sectorsize); + /* Release the reserved space beyond the last block. */ + last_block = round_up(start + copied, fs_info->sectorsize); - ret = btrfs_dirty_folio(BTRFS_I(inode), folio, pos, copied, - &cached_state, only_release_metadata); + shrink_reserved_space(inode, *data_reserved, reserved_start, + reserved_len, last_block - reserved_start, + only_release_metadata); + reserved_len = last_block - reserved_start; + } - /* - * If we have not locked the extent range, because the range's - * start offset is >= i_size, we might still have a non-NULL - * cached extent state, acquired while marking the extent range - * as delalloc through btrfs_dirty_page(). Therefore free any - * possible cached extent state to avoid a memory leak. - */ - if (extents_locked) - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state); - else - free_extent_state(cached_state); + ret = btrfs_dirty_folio(inode, folio, start, copied, &cached_state, + only_release_metadata); + /* + * If we have not locked the extent range, because the range's start + * offset is >= i_size, we might still have a non-NULL cached extent + * state, acquired while marking the extent range as delalloc through + * btrfs_dirty_page(). Therefore free any possible cached extent state + * to avoid a memory leak. + */ + if (extents_locked) + btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + else + btrfs_free_extent_state(cached_state); - btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); - if (ret) { - btrfs_drop_folio(fs_info, folio, pos, copied); - break; - } + btrfs_delalloc_release_extents(inode, reserved_len); + if (ret) { + btrfs_drop_folio(fs_info, folio, start, copied); + release_space(inode, *data_reserved, reserved_start, reserved_len, + only_release_metadata); + return ret; + } + if (only_release_metadata) + btrfs_check_nocow_unlock(inode); - release_bytes = 0; - if (only_release_metadata) - btrfs_check_nocow_unlock(BTRFS_I(inode)); + btrfs_drop_folio(fs_info, folio, start, copied); + return copied; +} - btrfs_drop_folio(fs_info, folio, pos, copied); +ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + loff_t pos; + struct inode *inode = file_inode(file); + struct extent_changeset *data_reserved = NULL; + size_t num_written = 0; + ssize_t ret; + loff_t old_isize; + unsigned int ilock_flags = 0; + const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); - cond_resched(); + if (nowait) + ilock_flags |= BTRFS_ILOCK_TRY; - pos += copied; - num_written += copied; - } + ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); + if (ret < 0) + return ret; - if (release_bytes) { - if (only_release_metadata) { - btrfs_check_nocow_unlock(BTRFS_I(inode)); - btrfs_delalloc_release_metadata(BTRFS_I(inode), - release_bytes, true); - } else { - btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, - round_down(pos, fs_info->sectorsize), - release_bytes, true); - } + /* + * We can only trust the isize with inode lock held, or it can race with + * other buffered writes and cause incorrect call of + * pagecache_isize_extended() to overwrite existing data. + */ + old_isize = i_size_read(inode); + + ret = generic_write_checks(iocb, iter); + if (ret <= 0) + goto out; + + ret = btrfs_write_check(iocb, ret); + if (ret < 0) + goto out; + + pos = iocb->ki_pos; + while (iov_iter_count(iter) > 0) { + ret = copy_one_range(BTRFS_I(inode), iter, &data_reserved, pos, nowait); + if (ret < 0) + break; + pos += ret; + num_written += ret; + cond_resched(); } extent_changeset_free(data_reserved); @@ -1422,7 +1486,7 @@ int btrfs_release_file(struct inode *inode, struct file *filp) if (private) { kfree(private->filldir_buf); - free_extent_state(private->llseek_cached_state); + btrfs_free_extent_state(private->llseek_cached_state); kfree(private); filp->private_data = NULL; } @@ -1790,27 +1854,25 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; struct folio *folio = page_folio(page); - struct inode *inode = file_inode(vmf->vma->vm_file); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_inode *inode = BTRFS_I(file_inode(vmf->vma->vm_file)); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; unsigned long zero_start; loff_t size; - vm_fault_t ret; - int ret2; - int reserved = 0; + size_t fsize = folio_size(folio); + int ret; + bool only_release_metadata = false; u64 reserved_space; u64 page_start; u64 page_end; u64 end; - ASSERT(folio_order(folio) == 0); - - reserved_space = PAGE_SIZE; + reserved_space = fsize; - sb_start_pagefault(inode->i_sb); + sb_start_pagefault(inode->vfs_inode.i_sb); page_start = folio_pos(folio); page_end = page_start + folio_size(folio) - 1; end = page_end; @@ -1823,38 +1885,53 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. */ - ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - page_start, reserved_space); - if (!ret2) { - ret2 = file_update_time(vmf->vma->vm_file); - reserved = 1; - } - if (ret2) { - ret = vmf_error(ret2); - if (reserved) - goto out; + ret = btrfs_check_data_free_space(inode, &data_reserved, page_start, + reserved_space, false); + if (ret < 0) { + size_t write_bytes = reserved_space; + + if (btrfs_check_nocow_lock(inode, page_start, &write_bytes, false) <= 0) + goto out_noreserve; + + only_release_metadata = true; + + /* + * Can't write the whole range, there may be shared extents or + * holes in the range, bail out with @only_release_metadata set + * to true so that we unlock the nocow lock before returning the + * error. + */ + if (write_bytes < reserved_space) + goto out_noreserve; + } + ret = btrfs_delalloc_reserve_metadata(inode, reserved_space, + reserved_space, false); + if (ret < 0) { + if (!only_release_metadata) + btrfs_free_reserved_data_space(inode, data_reserved, + page_start, reserved_space); goto out_noreserve; } - /* Make the VM retry the fault. */ - ret = VM_FAULT_NOPAGE; + ret = file_update_time(vmf->vma->vm_file); + if (ret < 0) + goto out; again: - down_read(&BTRFS_I(inode)->i_mmap_lock); + down_read(&inode->i_mmap_lock); folio_lock(folio); - size = i_size_read(inode); + size = i_size_read(&inode->vfs_inode); - if ((folio->mapping != inode->i_mapping) || + if ((folio->mapping != inode->vfs_inode.i_mapping) || (page_start >= size)) { /* Page got truncated out from underneath us. */ goto out_unlock; } folio_wait_writeback(folio); - lock_extent(io_tree, page_start, page_end, &cached_state); - ret2 = set_folio_extent_mapped(folio); - if (ret2 < 0) { - ret = vmf_error(ret2); - unlock_extent(io_tree, page_start, page_end, &cached_state); + btrfs_lock_extent(io_tree, page_start, page_end, &cached_state); + ret = set_folio_extent_mapped(folio); + if (ret < 0) { + btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); goto out_unlock; } @@ -1862,23 +1939,27 @@ again: * We can't set the delalloc bits if there are pending ordered * extents. Drop our locks and wait for them to finish. */ - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE); + ordered = btrfs_lookup_ordered_range(inode, page_start, fsize); if (ordered) { - unlock_extent(io_tree, page_start, page_end, &cached_state); + btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); folio_unlock(folio); - up_read(&BTRFS_I(inode)->i_mmap_lock); + up_read(&inode->i_mmap_lock); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } - if (folio->index == ((size - 1) >> PAGE_SHIFT)) { + if (folio_contains(folio, (size - 1) >> PAGE_SHIFT)) { reserved_space = round_up(size - page_start, fs_info->sectorsize); - if (reserved_space < PAGE_SIZE) { + if (reserved_space < fsize) { + const u64 to_free = fsize - reserved_space; + end = page_start + reserved_space - 1; - btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, page_start, - PAGE_SIZE - reserved_space, true); + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, to_free, true); + else + btrfs_delalloc_release_space(inode, data_reserved, + end + 1, to_free, true); } } @@ -1889,15 +1970,13 @@ again: * clear any delalloc bits within this page range since we have to * reserve data&meta space before lock_page() (see above comments). */ - clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, &cached_state); - - ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, - &cached_state); - if (ret2) { - unlock_extent(io_tree, page_start, page_end, &cached_state); - ret = VM_FAULT_SIGBUS; + btrfs_clear_extent_bit(io_tree, page_start, end, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, &cached_state); + + ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state); + if (ret < 0) { + btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); goto out_unlock; } @@ -1905,36 +1984,53 @@ again: if (page_start + folio_size(folio) > size) zero_start = offset_in_folio(folio, size); else - zero_start = PAGE_SIZE; + zero_start = fsize; - if (zero_start != PAGE_SIZE) + if (zero_start != fsize) folio_zero_range(folio, zero_start, folio_size(folio) - zero_start); - btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); + btrfs_folio_clear_checked(fs_info, folio, page_start, fsize); btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start); - btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); + btrfs_set_inode_last_sub_trans(inode); - unlock_extent(io_tree, page_start, page_end, &cached_state); - up_read(&BTRFS_I(inode)->i_mmap_lock); + if (only_release_metadata) + btrfs_set_extent_bit(io_tree, page_start, end, EXTENT_NORESERVE, + &cached_state); - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - sb_end_pagefault(inode->i_sb); + btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); + up_read(&inode->i_mmap_lock); + + btrfs_delalloc_release_extents(inode, fsize); + if (only_release_metadata) + btrfs_check_nocow_unlock(inode); + sb_end_pagefault(inode->vfs_inode.i_sb); extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; out_unlock: folio_unlock(folio); - up_read(&BTRFS_I(inode)->i_mmap_lock); + up_read(&inode->i_mmap_lock); out: - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, - reserved_space, (ret != 0)); -out_noreserve: - sb_end_pagefault(inode->i_sb); + btrfs_delalloc_release_extents(inode, fsize); + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, reserved_space, true); + else + btrfs_delalloc_release_space(inode, data_reserved, page_start, + reserved_space, true); extent_changeset_free(data_reserved); - return ret; +out_noreserve: + if (only_release_metadata) + btrfs_check_nocow_unlock(inode); + + sb_end_pagefault(inode->vfs_inode.i_sb); + + if (ret < 0) + return vmf_error(ret); + + /* Make the VM retry the fault. */ + return VM_FAULT_NOPAGE; } static const struct vm_operations_struct btrfs_file_vm_ops = { @@ -1943,46 +2039,47 @@ static const struct vm_operations_struct btrfs_file_vm_ops = { .page_mkwrite = btrfs_page_mkwrite, }; -static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) +static int btrfs_file_mmap_prepare(struct vm_area_desc *desc) { + struct file *filp = desc->file; struct address_space *mapping = filp->f_mapping; if (!mapping->a_ops->read_folio) return -ENOEXEC; file_accessed(filp); - vma->vm_ops = &btrfs_file_vm_ops; + desc->vm_ops = &btrfs_file_vm_ops; return 0; } -static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf, - int slot, u64 start, u64 end) +static bool hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf, + int slot, u64 start, u64 end) { struct btrfs_file_extent_item *fi; struct btrfs_key key; if (slot < 0 || slot >= btrfs_header_nritems(leaf)) - return 0; + return false; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) - return 0; + return false; fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) - return 0; + return false; if (btrfs_file_extent_disk_bytenr(leaf, fi)) - return 0; + return false; if (key.offset == end) - return 1; + return true; if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start) - return 1; - return 0; + return true; + return false; } static int fill_holes(struct btrfs_trans_handle *trans, @@ -2028,7 +2125,6 @@ static int fill_holes(struct btrfs_trans_handle *trans, btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); goto out; } @@ -2045,7 +2141,6 @@ static int fill_holes(struct btrfs_trans_handle *trans, btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); goto out; } btrfs_release_path(path); @@ -2058,7 +2153,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); - hole_em = alloc_extent_map(); + hole_em = btrfs_alloc_extent_map(); if (!hole_em) { btrfs_drop_extent_map_range(inode, offset, end - 1, false); btrfs_set_inode_full_sync(inode); @@ -2072,7 +2167,7 @@ out: hole_em->generation = trans->transid; ret = btrfs_replace_extent_map_range(inode, hole_em, true); - free_extent_map(hole_em); + btrfs_free_extent_map(hole_em); if (ret) btrfs_set_inode_full_sync(inode); } @@ -2105,15 +2200,33 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) 0 : *start + *len - em->start - em->len; *start = em->start + em->len; } - free_extent_map(em); + btrfs_free_extent_map(em); return ret; } -static void btrfs_punch_hole_lock_range(struct inode *inode, - const u64 lockstart, - const u64 lockend, - struct extent_state **cached_state) +/* + * Check if there is no folio in the range. + * + * We cannot utilize filemap_range_has_page() in a filemap with large folios + * as we can hit the following false positive: + * + * start end + * | | + * |//|//|//|//| | | | | | | | |//|//| + * \ / \ / + * Folio A Folio B + * + * That large folio A and B cover the start and end indexes. + * In that case filemap_range_has_page() will always return true, but the above + * case is fine for btrfs_punch_hole_lock_range() usage. + * + * So here we only ensure that no other folios is in the range, excluding the + * head/tail large folio. + */ +static bool check_range_has_page(struct inode *inode, u64 start, u64 end) { + struct folio_batch fbatch; + bool ret = false; /* * For subpage case, if the range is not at page boundary, we could * have pages at the leading/tailing part of the range. @@ -2121,15 +2234,48 @@ static void btrfs_punch_hole_lock_range(struct inode *inode, * will always return true. * So here we need to do extra page alignment for * filemap_range_has_page(). + * + * And do not decrease page_lockend right now, as it can be 0. */ - const u64 page_lockstart = round_up(lockstart, PAGE_SIZE); - const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1; + const u64 page_lockstart = round_up(start, PAGE_SIZE); + const u64 page_lockend = round_down(end + 1, PAGE_SIZE); + const pgoff_t start_index = page_lockstart >> PAGE_SHIFT; + const pgoff_t end_index = (page_lockend - 1) >> PAGE_SHIFT; + pgoff_t tmp = start_index; + int found_folios; + + /* The same page or adjacent pages. */ + if (page_lockend <= page_lockstart) + return false; + + folio_batch_init(&fbatch); + found_folios = filemap_get_folios(inode->i_mapping, &tmp, end_index, &fbatch); + for (int i = 0; i < found_folios; i++) { + struct folio *folio = fbatch.folios[i]; + + /* A large folio begins before the start. Not a target. */ + if (folio->index < start_index) + continue; + /* A large folio extends beyond the end. Not a target. */ + if (folio_next_index(folio) > end_index) + continue; + /* A folio doesn't cover the head/tail index. Found a target. */ + ret = true; + break; + } + folio_batch_release(&fbatch); + return ret; +} +static void btrfs_punch_hole_lock_range(struct inode *inode, + const u64 lockstart, const u64 lockend, + struct extent_state **cached_state) +{ while (1) { truncate_pagecache_range(inode, lockstart, lockend); - lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - cached_state); + btrfs_lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + cached_state); /* * We can't have ordered extents in the range, nor dirty/writeback * pages, because we have locked the inode's VFS lock in exclusive @@ -2140,12 +2286,11 @@ static void btrfs_punch_hole_lock_range(struct inode *inode, * locking the range check if we have pages in the range, and if * we do, unlock the range and retry. */ - if (!filemap_range_has_page(inode->i_mapping, page_lockstart, - page_lockend)) + if (!check_range_has_page(inode, lockstart, lockend)) break; - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + cached_state); } btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend); @@ -2193,7 +2338,6 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_num_bytes(leaf, extent, replace_len); if (extent_info->is_new_extent) btrfs_set_file_extent_generation(leaf, extent, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset, @@ -2259,7 +2403,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1); u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize); struct btrfs_trans_handle *trans = NULL; - struct btrfs_block_rsv *rsv; + struct btrfs_block_rsv rsv; unsigned int rsv_count; u64 cur_offset; u64 len = end - start; @@ -2268,13 +2412,9 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, if (end <= start) return -EINVAL; - rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); - if (!rsv) { - ret = -ENOMEM; - goto out; - } - rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1); - rsv->failfast = true; + btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP); + rsv.size = btrfs_calc_insert_metadata_size(fs_info, 1); + rsv.failfast = true; /* * 1 - update the inode @@ -2291,14 +2431,14 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; - goto out_free; + goto out_release; } - ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, + ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv, min_size, false); if (WARN_ON(ret)) goto out_trans; - trans->block_rsv = rsv; + trans->block_rsv = &rsv; cur_offset = start; drop_args.path = path; @@ -2414,10 +2554,10 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, } ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, - rsv, min_size, false); + &rsv, min_size, false); if (WARN_ON(ret)) break; - trans->block_rsv = rsv; + trans->block_rsv = &rsv; cur_offset = drop_args.drop_end; len = end - cur_offset; @@ -2494,16 +2634,15 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, out_trans: if (!trans) - goto out_free; + goto out_release; trans->block_rsv = &fs_info->trans_block_rsv; if (ret) btrfs_end_transaction(trans); else *trans_out = trans; -out_free: - btrfs_free_block_rsv(fs_info, rsv); -out: +out_release: + btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL); return ret; } @@ -2519,7 +2658,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) u64 lockend; u64 tail_start; u64 tail_len; - u64 orig_start = offset; + const u64 orig_start = offset; + const u64 orig_end = offset + len - 1; int ret = 0; bool same_block; u64 ino_size; @@ -2551,18 +2691,14 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset)) == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)); /* - * We needn't truncate any block which is beyond the end of the file - * because we are sure there is no data there. - */ - /* * Only do this if we are in the same block and we aren't doing the * entire block. */ if (same_block && len < fs_info->sectorsize) { if (offset < ino_size) { truncated_block = true; - ret = btrfs_truncate_block(BTRFS_I(inode), offset, len, - 0); + ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, + orig_start, orig_end); } else { ret = 0; } @@ -2572,7 +2708,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) /* zero back part of the first block */ if (offset < ino_size) { truncated_block = true; - ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), offset, orig_start, orig_end); if (ret) { btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); return ret; @@ -2609,8 +2745,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) if (tail_start + tail_len < ino_size) { truncated_block = true; ret = btrfs_truncate_block(BTRFS_I(inode), - tail_start + tail_len, - 0, 1); + tail_start + tail_len - 1, + orig_start, orig_end); if (ret) goto out_only_mutex; } @@ -2644,8 +2780,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); out: - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); out_only_mutex: if (!updated_inode && truncated_block && !ret) { /* @@ -2763,7 +2899,7 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, else ret = RANGE_BOUNDARY_WRITTEN_EXTENT; - free_extent_map(em); + btrfs_free_extent_map(em); return ret; } @@ -2778,6 +2914,8 @@ static int btrfs_zero_range(struct inode *inode, int ret; u64 alloc_hint = 0; const u64 sectorsize = fs_info->sectorsize; + const u64 orig_start = offset; + const u64 orig_end = offset + len - 1; u64 alloc_start = round_down(offset, sectorsize); u64 alloc_end = round_up(offset + len, sectorsize); u64 bytes_to_reserve = 0; @@ -2807,7 +2945,7 @@ static int btrfs_zero_range(struct inode *inode, * do nothing except updating the inode's i_size if * needed. */ - free_extent_map(em); + btrfs_free_extent_map(em); ret = btrfs_fallocate_update_isize(inode, offset + len, mode); goto out; @@ -2820,9 +2958,9 @@ static int btrfs_zero_range(struct inode *inode, ASSERT(IS_ALIGNED(alloc_start, sectorsize)); len = offset + len - alloc_start; offset = alloc_start; - alloc_hint = extent_map_block_start(em) + em->len; + alloc_hint = btrfs_extent_map_block_start(em) + em->len; } - free_extent_map(em); + btrfs_free_extent_map(em); if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { @@ -2833,22 +2971,22 @@ static int btrfs_zero_range(struct inode *inode, } if (em->flags & EXTENT_FLAG_PREALLOC) { - free_extent_map(em); + btrfs_free_extent_map(em); ret = btrfs_fallocate_update_isize(inode, offset + len, mode); goto out; } if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) { - free_extent_map(em); - ret = btrfs_truncate_block(BTRFS_I(inode), offset, len, - 0); + btrfs_free_extent_map(em); + ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, + orig_start, orig_end); if (!ret) ret = btrfs_fallocate_update_isize(inode, offset + len, mode); return ret; } - free_extent_map(em); + btrfs_free_extent_map(em); alloc_start = round_down(offset, sectorsize); alloc_end = alloc_start + sectorsize; goto reserve_space; @@ -2872,7 +3010,8 @@ static int btrfs_zero_range(struct inode *inode, alloc_start = round_down(offset, sectorsize); ret = 0; } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { - ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), offset, + orig_start, orig_end); if (ret) goto out; } else { @@ -2889,8 +3028,8 @@ static int btrfs_zero_range(struct inode *inode, alloc_end = round_up(offset + len, sectorsize); ret = 0; } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { - ret = btrfs_truncate_block(BTRFS_I(inode), offset + len, - 0, 1); + ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, + orig_start, orig_end); if (ret) goto out; } else { @@ -2915,16 +3054,16 @@ reserve_space: ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, alloc_start, bytes_to_reserve); if (ret) { - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state); goto out; } ret = btrfs_prealloc_file_range(inode, mode, alloc_start, alloc_end - alloc_start, fs_info->sectorsize, offset + len, &alloc_hint); - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); /* btrfs_prealloc_file_range releases reserved space on error */ if (ret) { space_reserved = false; @@ -3010,7 +3149,8 @@ static long btrfs_fallocate(struct file *file, int mode, * need to zero out the end of the block if i_size lands in the * middle of a block. */ - ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, + inode->i_size, (u64)-1); if (ret) goto out; } @@ -3035,8 +3175,8 @@ static long btrfs_fallocate(struct file *file, int mode, } locked_end = alloc_end - 1; - lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state); + btrfs_lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state); btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end); @@ -3048,8 +3188,8 @@ static long btrfs_fallocate(struct file *file, int mode, ret = PTR_ERR(em); break; } - last_byte = min(extent_map_end(em), alloc_end); - actual_end = min_t(u64, extent_map_end(em), offset + len); + last_byte = min(btrfs_extent_map_end(em), alloc_end); + actual_end = min_t(u64, btrfs_extent_map_end(em), offset + len); last_byte = ALIGN(last_byte, blocksize); if (em->disk_bytenr == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && @@ -3058,19 +3198,19 @@ static long btrfs_fallocate(struct file *file, int mode, ret = add_falloc_range(&reserve_list, cur_offset, range_len); if (ret < 0) { - free_extent_map(em); + btrfs_free_extent_map(em); break; } ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, cur_offset, range_len); if (ret < 0) { - free_extent_map(em); + btrfs_free_extent_map(em); break; } qgroup_reserved += range_len; data_space_needed += range_len; } - free_extent_map(em); + btrfs_free_extent_map(em); cur_offset = last_byte; } @@ -3124,8 +3264,8 @@ static long btrfs_fallocate(struct file *file, int mode, */ ret = btrfs_fallocate_update_isize(inode, actual_end, mode); out_unlock: - unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state); out: btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); extent_changeset_free(data_reserved); @@ -3159,10 +3299,10 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end if (inode->delalloc_bytes > 0) { spin_unlock(&inode->lock); *delalloc_start_ret = start; - delalloc_len = count_range_bits(&inode->io_tree, - delalloc_start_ret, end, - len, EXTENT_DELALLOC, 1, - cached_state); + delalloc_len = btrfs_count_range_bits(&inode->io_tree, + delalloc_start_ret, end, + len, EXTENT_DELALLOC, 1, + cached_state); } else { spin_unlock(&inode->lock); } @@ -3471,7 +3611,7 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) last_extent_end = lockstart; - lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + btrfs_lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { @@ -3617,7 +3757,7 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) } out: - unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); btrfs_free_path(path); if (ret < 0) @@ -3682,7 +3822,7 @@ const struct file_operations btrfs_file_operations = { .splice_read = filemap_splice_read, .write_iter = btrfs_file_write_iter, .splice_write = iter_file_splice_write, - .mmap = btrfs_file_mmap, + .mmap_prepare = btrfs_file_mmap_prepare, .open = btrfs_file_open, .release = btrfs_release_file, .get_unmapped_area = thp_get_unmapped_area, diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h index de89e644be29..d7df81388cbe 100644 --- a/fs/btrfs/file.h +++ b/fs/btrfs/file.h @@ -9,6 +9,8 @@ struct file; struct extent_state; struct kiocb; struct iov_iter; +struct inode; +struct folio; struct page; struct btrfs_ioctl_encoded_io_args; struct btrfs_drop_extents_args; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index cfa52ef40b06..5d8d1570a5c9 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -12,7 +12,7 @@ #include <linux/error-injection.h> #include <linux/sched/mm.h> #include <linux/string_choices.h> -#include "ctree.h" +#include "extent-tree.h" #include "fs.h" #include "messages.h" #include "misc.h" @@ -88,13 +88,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, struct btrfs_disk_key disk_key; struct btrfs_free_space_header *header; struct extent_buffer *leaf; - struct inode *inode = NULL; + struct btrfs_inode *inode; unsigned nofs_flag; int ret; key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -120,13 +120,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, btrfs_release_path(path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(inode)) - return inode; + return ERR_CAST(inode); - mapping_set_gfp_mask(inode->i_mapping, - mapping_gfp_constraint(inode->i_mapping, + mapping_set_gfp_mask(inode->vfs_inode.i_mapping, + mapping_gfp_constraint(inode->vfs_inode.i_mapping, ~(__GFP_FS | __GFP_HIGHMEM))); - return inode; + return &inode->vfs_inode; } struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, @@ -198,12 +198,11 @@ static int __create_free_space_inode(struct btrfs_root *root, btrfs_set_inode_nlink(leaf, inode_item, 1); btrfs_set_inode_transid(leaf, inode_item, trans->transid); btrfs_set_inode_block_group(leaf, inode_item, offset); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(struct btrfs_free_space_header)); if (ret < 0) { @@ -216,7 +215,6 @@ static int __create_free_space_inode(struct btrfs_root *root, struct btrfs_free_space_header); memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header)); btrfs_set_free_space_key(leaf, header, &disk_key); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); return 0; @@ -246,7 +244,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_block_group *block_group) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret = 0; @@ -259,12 +257,12 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) { if (PTR_ERR(inode) != -ENOENT) ret = PTR_ERR(inode); - goto out; + return ret; } ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) { btrfs_add_delayed_iput(BTRFS_I(inode)); - goto out; + return ret; } clear_nlink(inode); /* One for the block groups ref */ @@ -287,12 +285,9 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, if (ret) { if (ret > 0) ret = 0; - goto out; + return ret; } - ret = btrfs_del_item(trans, trans->fs_info->tree_root, path); -out: - btrfs_free_path(path); - return ret; + return btrfs_del_item(trans, trans->fs_info->tree_root, path); } int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, @@ -313,8 +308,9 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, bool locked = false; if (block_group) { - struct btrfs_path *path = btrfs_alloc_path(); + BTRFS_PATH_AUTO_FREE(path); + path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto fail; @@ -335,13 +331,12 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_CLEAR; spin_unlock(&block_group->lock); - btrfs_free_path(path); } btrfs_i_size_write(inode, 0); truncate_pagecache(vfs_inode, 0); - lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); + btrfs_lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); /* @@ -353,7 +348,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); btrfs_inode_safe_disk_i_size_write(inode, control.last_size); - unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); + btrfs_unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); if (ret) goto fail; @@ -371,7 +366,7 @@ fail: static void readahead_cache(struct inode *inode) { struct file_ra_state ra; - unsigned long last_index; + pgoff_t last_index; file_ra_state_init(&ra, inode->i_mapping); last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; @@ -449,7 +444,7 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) { - struct page *page; + struct folio *folio; struct inode *inode = io_ctl->inode; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int i; @@ -457,31 +452,33 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) for (i = 0; i < io_ctl->num_pages; i++) { int ret; - page = find_or_create_page(inode->i_mapping, i, mask); - if (!page) { + folio = __filemap_get_folio(inode->i_mapping, i, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mask); + if (IS_ERR(folio)) { io_ctl_drop_pages(io_ctl); - return -ENOMEM; + return PTR_ERR(folio); } - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); io_ctl_drop_pages(io_ctl); return ret; } - io_ctl->pages[i] = page; - if (uptodate && !PageUptodate(page)) { - btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (page->mapping != inode->i_mapping) { + io_ctl->pages[i] = &folio->page; + if (uptodate && !folio_test_uptodate(folio)) { + btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (folio->mapping != inode->i_mapping) { btrfs_err(BTRFS_I(inode)->root->fs_info, "free space cache page truncated"); io_ctl_drop_pages(io_ctl); return -EIO; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { btrfs_err(BTRFS_I(inode)->root->fs_info, "error reading free space cache"); io_ctl_drop_pages(io_ctl); @@ -755,8 +752,8 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, return 0; key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -1083,9 +1080,8 @@ int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) { - cluster = list_entry(block_group->cluster_list.next, - struct btrfs_free_cluster, - block_group_list); + cluster = list_first_entry(&block_group->cluster_list, + struct btrfs_free_cluster, block_group_list); } if (!node && cluster) { @@ -1158,13 +1154,13 @@ update_cache_item(struct btrfs_trans_handle *trans, int ret; key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DELALLOC, NULL); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, + EXTENT_DELALLOC, NULL); goto fail; } leaf = path->nodes[0]; @@ -1175,9 +1171,9 @@ update_cache_item(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || found_key.offset != offset) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, - inode->i_size - 1, EXTENT_DELALLOC, - NULL); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, + inode->i_size - 1, EXTENT_DELALLOC, + NULL); btrfs_release_path(path); goto fail; } @@ -1189,7 +1185,6 @@ update_cache_item(struct btrfs_trans_handle *trans, btrfs_set_free_space_entries(leaf, header, entries); btrfs_set_free_space_bitmaps(leaf, header, bitmaps); btrfs_set_free_space_generation(leaf, header, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); return 0; @@ -1223,9 +1218,9 @@ static noinline_for_stack int write_pinned_extent_entries( start = block_group->start; while (start < block_group->start + block_group->length) { - if (!find_first_extent_bit(unpin, start, - &extent_start, &extent_end, - EXTENT_DIRTY, NULL)) + if (!btrfs_find_first_extent_bit(unpin, start, + &extent_start, &extent_end, + EXTENT_DIRTY, NULL)) return 0; /* This pinned extent is out of our range */ @@ -1271,8 +1266,8 @@ static int flush_dirty_cache(struct inode *inode) ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (ret) - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DELALLOC, NULL); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, + EXTENT_DELALLOC, NULL); return ret; } @@ -1292,8 +1287,8 @@ cleanup_write_cache_enospc(struct inode *inode, struct extent_state **cached_state) { io_ctl_drop_pages(io_ctl); - unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + cached_state); } static int __btrfs_wait_cache_io(struct btrfs_root *root, @@ -1418,8 +1413,8 @@ static int __btrfs_write_out_cache(struct inode *inode, if (ret) goto out_unlock; - lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - &cached_state); + btrfs_lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + &cached_state); io_ctl_set_generation(io_ctl, trans->transid); @@ -1479,8 +1474,8 @@ static int __btrfs_write_out_cache(struct inode *inode, io_ctl_drop_pages(io_ctl); io_ctl_free(io_ctl); - unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + &cached_state); /* * at this point the pages are under IO and we're happy, @@ -2346,9 +2341,8 @@ again: struct rb_node *node; struct btrfs_free_space *entry; - cluster = list_entry(block_group->cluster_list.next, - struct btrfs_free_cluster, - block_group_list); + cluster = list_first_entry(&block_group->cluster_list, + struct btrfs_free_cluster, block_group_list); spin_lock(&cluster->lock); node = rb_first(&cluster->root); if (!node) { @@ -3198,7 +3192,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, u64 *max_extent_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - int err; + int ret2; u64 search_start = cluster->window_start; u64 search_bytes = bytes; u64 ret = 0; @@ -3206,8 +3200,8 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, search_start = min_start; search_bytes = bytes; - err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); - if (err) { + ret2 = search_bitmap(ctl, entry, &search_start, &search_bytes, true); + if (ret2) { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); return 0; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 7ba50e133921..eba7f22ae49c 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -35,7 +35,7 @@ static struct btrfs_root *btrfs_free_space_root( return btrfs_global_root(block_group->fs_info, &key); } -void set_free_space_tree_thresholds(struct btrfs_block_group *cache) +void btrfs_set_free_space_tree_thresholds(struct btrfs_block_group *cache) { u32 bitmap_range; size_t bitmap_size; @@ -82,23 +82,19 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*info)); if (ret) - goto out; + return ret; leaf = path->nodes[0]; info = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_free_space_info); btrfs_set_free_space_extent_count(leaf, info, 0); btrfs_set_free_space_flags(leaf, info, 0); - btrfs_mark_buffer_dirty(trans, leaf); - - ret = 0; -out: btrfs_release_path(path); - return ret; + return 0; } EXPORT_FOR_TESTS -struct btrfs_free_space_info *search_free_space_info( +struct btrfs_free_space_info *btrfs_search_free_space_info( struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path, int cow) @@ -118,7 +114,7 @@ struct btrfs_free_space_info *search_free_space_info( if (ret != 0) { btrfs_warn(fs_info, "missing free space info for %llu", block_group->start); - ASSERT(0); + DEBUG_WARN(); return ERR_PTR(-ENOENT); } @@ -142,12 +138,12 @@ static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans, return ret; if (ret == 0) { - ASSERT(0); + DEBUG_WARN(); return -EIO; } if (p->slots[0] == 0) { - ASSERT(0); + DEBUG_WARN("no previous slot found"); return -EIO; } p->slots[0]--; @@ -202,9 +198,9 @@ static void le_bitmap_set(unsigned long *map, unsigned int start, int len) } EXPORT_FOR_TESTS -int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path) +int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = btrfs_free_space_root(block_group); @@ -224,6 +220,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, bitmap = alloc_bitmap(bitmap_size); if (!bitmap) { ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); goto out; } @@ -236,8 +233,10 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, while (!done) { ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } leaf = path->nodes[0]; nr = 0; @@ -272,22 +271,26 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, } ret = btrfs_del_items(trans, root, path, path->slots[0], nr); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } btrfs_release_path(path); } - info = search_free_space_info(trans, block_group, path, 1); + info = btrfs_search_free_space_info(trans, block_group, path, 1); if (IS_ERR(info)) { ret = PTR_ERR(info); + btrfs_abort_transaction(trans, ret); goto out; } leaf = path->nodes[0]; flags = btrfs_free_space_flags(leaf, info); flags |= BTRFS_FREE_SPACE_USING_BITMAPS; + block_group->using_free_space_bitmaps = true; + block_group->using_free_space_bitmaps_cached = true; btrfs_set_free_space_flags(leaf, info, flags); expected_extent_count = btrfs_free_space_extent_count(leaf, info); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); if (extent_count != expected_extent_count) { @@ -295,8 +298,8 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, expected_extent_count); - ASSERT(0); ret = -EIO; + btrfs_abort_transaction(trans, ret); goto out; } @@ -317,14 +320,15 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, root, path, &key, data_size); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } leaf = path->nodes[0]; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); write_extent_buffer(leaf, bitmap_cursor, ptr, data_size); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); i += extent_size; @@ -334,15 +338,13 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, ret = 0; out: kvfree(bitmap); - if (ret) - btrfs_abort_transaction(trans, ret); return ret; } EXPORT_FOR_TESTS -int convert_free_space_to_extents(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path) +int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = btrfs_free_space_root(block_group); @@ -361,6 +363,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, bitmap = alloc_bitmap(bitmap_size); if (!bitmap) { ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); goto out; } @@ -373,8 +376,10 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, while (!done) { ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } leaf = path->nodes[0]; nr = 0; @@ -403,50 +408,56 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, data_size = free_space_bitmap_size(fs_info, found_key.offset); - ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1); + path->slots[0]--; + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); read_extent_buffer(leaf, bitmap_cursor, ptr, data_size); nr++; - path->slots[0]--; } else { ASSERT(0); } } ret = btrfs_del_items(trans, root, path, path->slots[0], nr); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } btrfs_release_path(path); } - info = search_free_space_info(trans, block_group, path, 1); + info = btrfs_search_free_space_info(trans, block_group, path, 1); if (IS_ERR(info)) { ret = PTR_ERR(info); + btrfs_abort_transaction(trans, ret); goto out; } leaf = path->nodes[0]; flags = btrfs_free_space_flags(leaf, info); flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS; + block_group->using_free_space_bitmaps = false; + block_group->using_free_space_bitmaps_cached = true; btrfs_set_free_space_flags(leaf, info, flags); expected_extent_count = btrfs_free_space_extent_count(leaf, info); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); - nrbits = block_group->length >> block_group->fs_info->sectorsize_bits; + nrbits = block_group->length >> fs_info->sectorsize_bits; start_bit = find_next_bit_le(bitmap, nrbits, 0); while (start_bit < nrbits) { end_bit = find_next_zero_bit_le(bitmap, nrbits, start_bit); ASSERT(start_bit < end_bit); - key.objectid = start + start_bit * block_group->fs_info->sectorsize; + key.objectid = start + start_bit * fs_info->sectorsize; key.type = BTRFS_FREE_SPACE_EXTENT_KEY; - key.offset = (end_bit - start_bit) * block_group->fs_info->sectorsize; + key.offset = (end_bit - start_bit) * fs_info->sectorsize; ret = btrfs_insert_empty_item(trans, root, path, &key, 0); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } btrfs_release_path(path); extent_count++; @@ -459,16 +470,14 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, expected_extent_count); - ASSERT(0); ret = -EIO; + btrfs_abort_transaction(trans, ret); goto out; } ret = 0; out: kvfree(bitmap); - if (ret) - btrfs_abort_transaction(trans, ret); return ret; } @@ -485,34 +494,31 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans, if (new_extents == 0) return 0; - info = search_free_space_info(trans, block_group, path, 1); - if (IS_ERR(info)) { - ret = PTR_ERR(info); - goto out; - } + info = btrfs_search_free_space_info(trans, block_group, path, 1); + if (IS_ERR(info)) + return PTR_ERR(info); + flags = btrfs_free_space_flags(path->nodes[0], info); extent_count = btrfs_free_space_extent_count(path->nodes[0], info); extent_count += new_extents; btrfs_set_free_space_extent_count(path->nodes[0], info, extent_count); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) && extent_count > block_group->bitmap_high_thresh) { - ret = convert_free_space_to_bitmaps(trans, block_group, path); + ret = btrfs_convert_free_space_to_bitmaps(trans, block_group, path); } else if ((flags & BTRFS_FREE_SPACE_USING_BITMAPS) && extent_count < block_group->bitmap_low_thresh) { - ret = convert_free_space_to_extents(trans, block_group, path); + ret = btrfs_convert_free_space_to_extents(trans, block_group, path); } -out: return ret; } EXPORT_FOR_TESTS -int free_space_test_bit(struct btrfs_block_group *block_group, - struct btrfs_path *path, u64 offset) +bool btrfs_free_space_test_bit(struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 offset) { struct extent_buffer *leaf; struct btrfs_key key; @@ -530,13 +536,13 @@ int free_space_test_bit(struct btrfs_block_group *block_group, ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); i = div_u64(offset - found_start, block_group->fs_info->sectorsize); - return !!extent_buffer_test_bit(leaf, ptr, i); + return extent_buffer_test_bit(leaf, ptr, i); } -static void free_space_set_bits(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path, u64 *start, u64 *size, - int bit) +static void free_space_modify_bits(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 *start, u64 *size, + bool set_bits) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct extent_buffer *leaf; @@ -560,7 +566,7 @@ static void free_space_set_bits(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); first = (*start - found_start) >> fs_info->sectorsize_bits; last = (end - found_start) >> fs_info->sectorsize_bits; - if (bit) + if (set_bits) extent_buffer_bitmap_set(leaf, ptr, first, last - first); else extent_buffer_bitmap_clear(leaf, ptr, first, last - first); @@ -604,13 +610,14 @@ static int free_space_next_bitmap(struct btrfs_trans_handle *trans, static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path, - u64 start, u64 size, int remove) + u64 start, u64 size, bool remove) { struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_key key; u64 end = start + size; u64 cur_start, cur_size; - int prev_bit, next_bit; + bool prev_bit_set = false; + bool next_bit_set = false; int new_extents; int ret; @@ -627,16 +634,16 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1); if (ret) - goto out; + return ret; - prev_bit = free_space_test_bit(block_group, path, prev_block); + prev_bit_set = btrfs_free_space_test_bit(block_group, path, prev_block); /* The previous block may have been in the previous bitmap. */ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (start >= key.objectid + key.offset) { ret = free_space_next_bitmap(trans, root, path); if (ret) - goto out; + return ret; } } else { key.objectid = start; @@ -645,9 +652,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1); if (ret) - goto out; - - prev_bit = -1; + return ret; } /* @@ -657,13 +662,13 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, cur_start = start; cur_size = size; while (1) { - free_space_set_bits(trans, block_group, path, &cur_start, &cur_size, - !remove); + free_space_modify_bits(trans, block_group, path, &cur_start, + &cur_size, !remove); if (cur_size == 0) break; ret = free_space_next_bitmap(trans, root, path); if (ret) - goto out; + return ret; } /* @@ -676,42 +681,36 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, if (end >= key.objectid + key.offset) { ret = free_space_next_bitmap(trans, root, path); if (ret) - goto out; + return ret; } - next_bit = free_space_test_bit(block_group, path, end); - } else { - next_bit = -1; + next_bit_set = btrfs_free_space_test_bit(block_group, path, end); } if (remove) { new_extents = -1; - if (prev_bit == 1) { + if (prev_bit_set) { /* Leftover on the left. */ new_extents++; } - if (next_bit == 1) { + if (next_bit_set) { /* Leftover on the right. */ new_extents++; } } else { new_extents = 1; - if (prev_bit == 1) { + if (prev_bit_set) { /* Merging with neighbor on the left. */ new_extents--; } - if (next_bit == 1) { + if (next_bit_set) { /* Merging with neighbor on the right. */ new_extents--; } } btrfs_release_path(path); - ret = update_free_space_extent_count(trans, block_group, path, - new_extents); - -out: - return ret; + return update_free_space_extent_count(trans, block_group, path, new_extents); } static int remove_free_space_extent(struct btrfs_trans_handle *trans, @@ -732,7 +731,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans, ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); if (ret) - goto out; + return ret; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); @@ -764,7 +763,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans, /* Delete the existing key (cases 1-4). */ ret = btrfs_del_item(trans, root, path); if (ret) - goto out; + return ret; /* Add a key for leftovers at the beginning (cases 3 and 4). */ if (start > found_start) { @@ -775,7 +774,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans, btrfs_release_path(path); ret = btrfs_insert_empty_item(trans, root, path, &key, 0); if (ret) - goto out; + return ret; new_extents++; } @@ -788,50 +787,58 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans, btrfs_release_path(path); ret = btrfs_insert_empty_item(trans, root, path, &key, 0); if (ret) - goto out; + return ret; new_extents++; } btrfs_release_path(path); - ret = update_free_space_extent_count(trans, block_group, path, - new_extents); - -out: - return ret; + return update_free_space_extent_count(trans, block_group, path, new_extents); } -EXPORT_FOR_TESTS -int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path, u64 start, u64 size) +static int using_bitmaps(struct btrfs_block_group *bg, struct btrfs_path *path) { struct btrfs_free_space_info *info; u32 flags; - int ret; - if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) { - ret = __add_block_group_free_space(trans, block_group, path); - if (ret) - return ret; - } + if (bg->using_free_space_bitmaps_cached) + return bg->using_free_space_bitmaps; - info = search_free_space_info(NULL, block_group, path, 0); + info = btrfs_search_free_space_info(NULL, bg, path, 0); if (IS_ERR(info)) return PTR_ERR(info); flags = btrfs_free_space_flags(path->nodes[0], info); btrfs_release_path(path); - if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { + bg->using_free_space_bitmaps = (flags & BTRFS_FREE_SPACE_USING_BITMAPS); + bg->using_free_space_bitmaps_cached = true; + + return bg->using_free_space_bitmaps; +} + +EXPORT_FOR_TESTS +int __btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 start, u64 size) +{ + int ret; + + ret = __add_block_group_free_space(trans, block_group, path); + if (ret) + return ret; + + ret = using_bitmaps(block_group, path); + if (ret < 0) + return ret; + + if (ret) return modify_free_space_bitmap(trans, block_group, path, - start, size, 1); - } else { - return remove_free_space_extent(trans, block_group, path, - start, size); - } + start, size, true); + + return remove_free_space_extent(trans, block_group, path, start, size); } -int remove_from_free_space_tree(struct btrfs_trans_handle *trans, - u64 start, u64 size) +int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, + u64 start, u64 size) { struct btrfs_block_group *block_group; struct btrfs_path *path; @@ -843,26 +850,27 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); goto out; } block_group = btrfs_lookup_block_group(trans->fs_info, start); if (!block_group) { - ASSERT(0); + DEBUG_WARN("no block group found for start=%llu", start); ret = -ENOENT; + btrfs_abort_transaction(trans, ret); goto out; } mutex_lock(&block_group->free_space_lock); - ret = __remove_from_free_space_tree(trans, block_group, path, start, - size); + ret = __btrfs_remove_from_free_space_tree(trans, block_group, path, start, size); mutex_unlock(&block_group->free_space_lock); + if (ret) + btrfs_abort_transaction(trans, ret); btrfs_put_block_group(block_group); out: btrfs_free_path(path); - if (ret) - btrfs_abort_transaction(trans, ret); return ret; } @@ -909,7 +917,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans, ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); if (ret) - goto out; + return ret; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); @@ -932,7 +940,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans, if (found_end == start) { ret = btrfs_del_item(trans, root, path); if (ret) - goto out; + return ret; new_key.objectid = found_start; new_key.offset += key.offset; new_extents--; @@ -949,7 +957,7 @@ right: ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); if (ret) - goto out; + return ret; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); @@ -973,7 +981,7 @@ right: if (found_start == end) { ret = btrfs_del_item(trans, root, path); if (ret) - goto out; + return ret; new_key.offset += key.offset; new_extents--; } @@ -983,48 +991,36 @@ insert: /* Insert the new key (cases 1-4). */ ret = btrfs_insert_empty_item(trans, root, path, &new_key, 0); if (ret) - goto out; + return ret; btrfs_release_path(path); - ret = update_free_space_extent_count(trans, block_group, path, - new_extents); - -out: - return ret; + return update_free_space_extent_count(trans, block_group, path, new_extents); } EXPORT_FOR_TESTS -int __add_to_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path, u64 start, u64 size) +int __btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 start, u64 size) { - struct btrfs_free_space_info *info; - u32 flags; int ret; - if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) { - ret = __add_block_group_free_space(trans, block_group, path); - if (ret) - return ret; - } + ret = __add_block_group_free_space(trans, block_group, path); + if (ret) + return ret; - info = search_free_space_info(NULL, block_group, path, 0); - if (IS_ERR(info)) - return PTR_ERR(info); - flags = btrfs_free_space_flags(path->nodes[0], info); - btrfs_release_path(path); + ret = using_bitmaps(block_group, path); + if (ret < 0) + return ret; - if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { + if (ret) return modify_free_space_bitmap(trans, block_group, path, - start, size, 0); - } else { - return add_free_space_extent(trans, block_group, path, start, - size); - } + start, size, false); + + return add_free_space_extent(trans, block_group, path, start, size); } -int add_to_free_space_tree(struct btrfs_trans_handle *trans, - u64 start, u64 size) +int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, + u64 start, u64 size) { struct btrfs_block_group *block_group; struct btrfs_path *path; @@ -1036,25 +1032,27 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); goto out; } block_group = btrfs_lookup_block_group(trans->fs_info, start); if (!block_group) { - ASSERT(0); + DEBUG_WARN("no block group found for start=%llu", start); ret = -ENOENT; + btrfs_abort_transaction(trans, ret); goto out; } mutex_lock(&block_group->free_space_lock); - ret = __add_to_free_space_tree(trans, block_group, path, start, size); + ret = __btrfs_add_to_free_space_tree(trans, block_group, path, start, size); mutex_unlock(&block_group->free_space_lock); + if (ret) + btrfs_abort_transaction(trans, ret); btrfs_put_block_group(block_group); out: btrfs_free_path(path); - if (ret) - btrfs_abort_transaction(trans, ret); return ret; } @@ -1067,7 +1065,8 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group) { struct btrfs_root *extent_root; - struct btrfs_path *path, *path2; + BTRFS_PATH_AUTO_FREE(path); + BTRFS_PATH_AUTO_FREE(path2); struct btrfs_key key; u64 start, end; int ret; @@ -1075,17 +1074,16 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = READA_FORWARD; path2 = btrfs_alloc_path(); - if (!path2) { - btrfs_free_path(path); + if (!path2) return -ENOMEM; - } + + path->reada = READA_FORWARD; ret = add_new_free_space_info(trans, block_group, path2); if (ret) - goto out; + return ret; mutex_lock(&block_group->free_space_lock); @@ -1104,11 +1102,21 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0); if (ret < 0) goto out_locked; - ASSERT(ret == 0); + /* + * If ret is 1 (no key found), it means this is an empty block group, + * without any extents allocated from it and there's no block group + * item (key BTRFS_BLOCK_GROUP_ITEM_KEY) located in the extent tree + * because we are using the block group tree feature, so block group + * items are stored in the block group tree. It also means there are no + * extents allocated for block groups with a start offset beyond this + * block group's end offset (this is the last, highest, block group). + */ + if (!btrfs_fs_compat_ro(trans->fs_info, BLOCK_GROUP_TREE)) + ASSERT(ret == 0); start = block_group->start; end = block_group->start + block_group->length; - while (1) { + while (ret == 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.type == BTRFS_EXTENT_ITEM_KEY || @@ -1117,11 +1125,11 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, break; if (start < key.objectid) { - ret = __add_to_free_space_tree(trans, - block_group, - path2, start, - key.objectid - - start); + ret = __btrfs_add_to_free_space_tree(trans, + block_group, + path2, start, + key.objectid - + start); if (ret) goto out_locked; } @@ -1138,12 +1146,10 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, ret = btrfs_next_item(extent_root, path); if (ret < 0) goto out_locked; - if (ret) - break; } if (start < end) { - ret = __add_to_free_space_tree(trans, block_group, path2, - start, end - start); + ret = __btrfs_add_to_free_space_tree(trans, block_group, path2, + start, end - start); if (ret) goto out_locked; } @@ -1151,9 +1157,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, ret = 0; out_locked: mutex_unlock(&block_group->free_space_lock); -out: - btrfs_free_path(path2); - btrfs_free_path(path); + return ret; } @@ -1222,8 +1226,9 @@ out_clear: static int clear_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; + struct rb_node *node; int nr; int ret; @@ -1238,7 +1243,7 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans, while (1) { ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) - goto out; + return ret; nr = btrfs_header_nritems(path->nodes[0]); if (!nr) @@ -1247,15 +1252,22 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans, path->slots[0] = 0; ret = btrfs_del_items(trans, root, path, 0, nr); if (ret) - goto out; + return ret; btrfs_release_path(path); } - ret = 0; -out: - btrfs_free_path(path); - return ret; + node = rb_first_cached(&trans->fs_info->block_group_cache_tree); + while (node) { + struct btrfs_block_group *bg; + + bg = rb_entry(node, struct btrfs_block_group, cache_node); + clear_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &bg->runtime_flags); + node = rb_next(node); + cond_resched(); + } + + return 0; } int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) @@ -1344,12 +1356,24 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) block_group = rb_entry(node, struct btrfs_block_group, cache_node); + + if (test_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, + &block_group->runtime_flags)) + goto next; + ret = populate_free_space_tree(trans, block_group); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; } +next: + if (btrfs_should_end_transaction(trans)) { + btrfs_end_transaction(trans); + trans = btrfs_start_transaction(free_space_root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + } node = rb_next(node); } @@ -1366,51 +1390,79 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path) { + bool own_path = false; int ret; - clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags); + if (!test_and_clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, + &block_group->runtime_flags)) + return 0; + + /* + * While rebuilding the free space tree we may allocate new metadata + * block groups while modifying the free space tree. + * + * Because during the rebuild (at btrfs_rebuild_free_space_tree()) we + * can use multiple transactions, every time btrfs_end_transaction() is + * called at btrfs_rebuild_free_space_tree() we finish the creation of + * new block groups by calling btrfs_create_pending_block_groups(), and + * that in turn calls us, through add_block_group_free_space(), to add + * a free space info item and a free space extent item for the block + * group. + * + * Then later btrfs_rebuild_free_space_tree() may find such new block + * groups and processes them with populate_free_space_tree(), which can + * fail with EEXIST since there are already items for the block group in + * the free space tree. Notice that we say "may find" because a new + * block group may be added to the block groups rbtree in a node before + * or after the block group currently being processed by the rebuild + * process. So signal the rebuild process to skip such new block groups + * if it finds them. + */ + set_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &block_group->runtime_flags); + + if (!path) { + path = btrfs_alloc_path(); + if (!path) { + btrfs_abort_transaction(trans, -ENOMEM); + return -ENOMEM; + } + own_path = true; + } ret = add_new_free_space_info(trans, block_group, path); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + ret = __btrfs_add_to_free_space_tree(trans, block_group, path, + block_group->start, block_group->length); if (ret) - return ret; + btrfs_abort_transaction(trans, ret); + +out: + if (own_path) + btrfs_free_path(path); - return __add_to_free_space_tree(trans, block_group, path, - block_group->start, - block_group->length); + return ret; } -int add_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group) +int btrfs_add_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group) { - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_path *path = NULL; - int ret = 0; + int ret; - if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE)) return 0; mutex_lock(&block_group->free_space_lock); - if (!test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) - goto out; - - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - - ret = __add_block_group_free_space(trans, block_group, path); - -out: - btrfs_free_path(path); + ret = __add_block_group_free_space(trans, block_group, NULL); mutex_unlock(&block_group->free_space_lock); - if (ret) - btrfs_abort_transaction(trans, ret); return ret; } -int remove_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group) +int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group) { struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_path *path; @@ -1431,6 +1483,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); goto out; } @@ -1443,8 +1496,10 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans, while (!done) { ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } leaf = path->nodes[0]; nr = 0; @@ -1472,16 +1527,16 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans, } ret = btrfs_del_items(trans, root, path, path->slots[0], nr); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } btrfs_release_path(path); } ret = 0; out: btrfs_free_path(path); - if (ret) - btrfs_abort_transaction(trans, ret); return ret; } @@ -1493,7 +1548,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, struct btrfs_fs_info *fs_info; struct btrfs_root *root; struct btrfs_key key; - int prev_bit = 0, bit; + bool prev_bit_set = false; /* Initialize to silence GCC. */ u64 extent_start = 0; u64 end, offset; @@ -1510,7 +1565,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, while (1) { ret = btrfs_next_item(root, path); if (ret < 0) - goto out; + return ret; if (ret) break; @@ -1524,10 +1579,12 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, offset = key.objectid; while (offset < key.objectid + key.offset) { - bit = free_space_test_bit(block_group, path, offset); - if (prev_bit == 0 && bit == 1) { + bool bit_set; + + bit_set = btrfs_free_space_test_bit(block_group, path, offset); + if (!prev_bit_set && bit_set) { extent_start = offset; - } else if (prev_bit == 1 && bit == 0) { + } else if (prev_bit_set && !bit_set) { u64 space_added; ret = btrfs_add_new_free_space(block_group, @@ -1535,7 +1592,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, offset, &space_added); if (ret) - goto out; + return ret; total_found += space_added; if (total_found > CACHING_CTL_WAKE_UP) { total_found = 0; @@ -1543,14 +1600,14 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, } extent_count++; } - prev_bit = bit; + prev_bit_set = bit_set; offset += fs_info->sectorsize; } } - if (prev_bit == 1) { + if (prev_bit_set) { ret = btrfs_add_new_free_space(block_group, extent_start, end, NULL); if (ret) - goto out; + return ret; extent_count++; } @@ -1559,14 +1616,11 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, expected_extent_count); - ASSERT(0); - ret = -EIO; - goto out; + DEBUG_WARN(); + return -EIO; } - ret = 0; -out: - return ret; + return 0; } static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, @@ -1593,7 +1647,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, ret = btrfs_next_item(root, path); if (ret < 0) - goto out; + return ret; if (ret) break; @@ -1609,7 +1663,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, key.objectid + key.offset, &space_added); if (ret) - goto out; + return ret; total_found += space_added; if (total_found > CACHING_CTL_WAKE_UP) { total_found = 0; @@ -1623,23 +1677,19 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, expected_extent_count); - ASSERT(0); - ret = -EIO; - goto out; + DEBUG_WARN(); + return -EIO; } - ret = 0; -out: - return ret; + return 0; } -int load_free_space_tree(struct btrfs_caching_control *caching_ctl) +int btrfs_load_free_space_tree(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group *block_group; struct btrfs_free_space_info *info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); u32 extent_count, flags; - int ret; block_group = caching_ctl->block_group; @@ -1655,11 +1705,10 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl) path->search_commit_root = 1; path->reada = READA_FORWARD; - info = search_free_space_info(NULL, block_group, path, 0); - if (IS_ERR(info)) { - ret = PTR_ERR(info); - goto out; - } + info = btrfs_search_free_space_info(NULL, block_group, path, 0); + if (IS_ERR(info)) + return PTR_ERR(info); + extent_count = btrfs_free_space_extent_count(path->nodes[0], info); flags = btrfs_free_space_flags(path->nodes[0], info); @@ -1669,11 +1718,7 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl) * there. */ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) - ret = load_free_space_bitmaps(caching_ctl, path, extent_count); + return load_free_space_bitmaps(caching_ctl, path, extent_count); else - ret = load_free_space_extents(caching_ctl, path, extent_count); - -out: - btrfs_free_path(path); - return ret; + return load_free_space_extents(caching_ctl, path, extent_count); } diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h index e6c6d6f4f221..3d9a5d4477fc 100644 --- a/fs/btrfs/free-space-tree.h +++ b/fs/btrfs/free-space-tree.h @@ -22,39 +22,39 @@ struct btrfs_trans_handle; #define BTRFS_FREE_SPACE_BITMAP_SIZE 256 #define BTRFS_FREE_SPACE_BITMAP_BITS (BTRFS_FREE_SPACE_BITMAP_SIZE * BITS_PER_BYTE) -void set_free_space_tree_thresholds(struct btrfs_block_group *block_group); +void btrfs_set_free_space_tree_thresholds(struct btrfs_block_group *block_group); int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info); int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info); int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info); -int load_free_space_tree(struct btrfs_caching_control *caching_ctl); -int add_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group); -int remove_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group); -int add_to_free_space_tree(struct btrfs_trans_handle *trans, - u64 start, u64 size); -int remove_from_free_space_tree(struct btrfs_trans_handle *trans, - u64 start, u64 size); +int btrfs_load_free_space_tree(struct btrfs_caching_control *caching_ctl); +int btrfs_add_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group); +int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group); +int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, + u64 start, u64 size); +int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, + u64 start, u64 size); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_free_space_info * -search_free_space_info(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path, int cow); -int __add_to_free_space_tree(struct btrfs_trans_handle *trans, +btrfs_search_free_space_info(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, - struct btrfs_path *path, u64 start, u64 size); -int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path, u64 start, u64 size); -int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path); -int convert_free_space_to_extents(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path); -int free_space_test_bit(struct btrfs_block_group *block_group, - struct btrfs_path *path, u64 offset); + struct btrfs_path *path, int cow); +int __btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 start, u64 size); +int __btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 start, u64 size); +int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path); +int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path); +bool btrfs_free_space_test_bit(struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 offset); #endif #endif diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index 31c1648bc0b4..b2bb86f8d7cf 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -1,9 +1,138 @@ // SPDX-License-Identifier: GPL-2.0 #include "messages.h" -#include "ctree.h" #include "fs.h" #include "accessors.h" +#include "volumes.h" + +static const struct btrfs_csums { + u16 size; + const char name[10]; + const char driver[12]; +} btrfs_csums[] = { + [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" }, + [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" }, + [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" }, + [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b", + .driver = "blake2b-256" }, +}; + +/* This exists for btrfs-progs usages. */ +u16 btrfs_csum_type_size(u16 type) +{ + return btrfs_csums[type].size; +} + +int btrfs_super_csum_size(const struct btrfs_super_block *s) +{ + u16 t = btrfs_super_csum_type(s); + + /* csum type is validated at mount time. */ + return btrfs_csum_type_size(t); +} + +const char *btrfs_super_csum_name(u16 csum_type) +{ + /* csum type is validated at mount time. */ + return btrfs_csums[csum_type].name; +} + +/* + * Return driver name if defined, otherwise the name that's also a valid driver + * name. + */ +const char *btrfs_super_csum_driver(u16 csum_type) +{ + /* csum type is validated at mount time */ + return btrfs_csums[csum_type].driver[0] ? + btrfs_csums[csum_type].driver : + btrfs_csums[csum_type].name; +} + +size_t __attribute_const__ btrfs_get_num_csums(void) +{ + return ARRAY_SIZE(btrfs_csums); +} + +/* + * Start exclusive operation @type, return true on success. + */ +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type) +{ + bool ret = false; + + spin_lock(&fs_info->super_lock); + if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) { + fs_info->exclusive_operation = type; + ret = true; + } + spin_unlock(&fs_info->super_lock); + + return ret; +} + +/* + * Conditionally allow to enter the exclusive operation in case it's compatible + * with the running one. This must be paired with btrfs_exclop_start_unlock() + * and btrfs_exclop_finish(). + * + * Compatibility: + * - the same type is already running + * - when trying to add a device and balance has been paused + * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller + * must check the condition first that would allow none -> @type + */ +bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type) +{ + spin_lock(&fs_info->super_lock); + if (fs_info->exclusive_operation == type || + (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED && + type == BTRFS_EXCLOP_DEV_ADD)) + return true; + + spin_unlock(&fs_info->super_lock); + return false; +} + +void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info) +{ + spin_unlock(&fs_info->super_lock); +} + +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) +{ + spin_lock(&fs_info->super_lock); + WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); + spin_unlock(&fs_info->super_lock); + sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); +} + +void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation op) +{ + switch (op) { + case BTRFS_EXCLOP_BALANCE_PAUSED: + spin_lock(&fs_info->super_lock); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE || + fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD || + fs_info->exclusive_operation == BTRFS_EXCLOP_NONE || + fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); + fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED; + spin_unlock(&fs_info->super_lock); + break; + case BTRFS_EXCLOP_BALANCE: + spin_lock(&fs_info->super_lock); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); + fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; + spin_unlock(&fs_info->super_lock); + break; + default: + btrfs_warn(fs_info, + "invalid exclop balance operation %d requested", op); + } +} void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name) diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 79a1a3d6f04d..8cc07cc70b12 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -14,10 +14,10 @@ #include <linux/lockdep.h> #include <linux/spinlock.h> #include <linux/mutex.h> -#include <linux/rwlock_types.h> #include <linux/rwsem.h> #include <linux/semaphore.h> #include <linux/list.h> +#include <linux/pagemap.h> #include <linux/radix-tree.h> #include <linux/workqueue.h> #include <linux/wait.h> @@ -47,6 +47,18 @@ struct btrfs_subpage_info; struct btrfs_stripe_hash_table; struct btrfs_space_info; +/* + * Minimum data and metadata block size. + * + * Normally it's 4K, but for testing subpage block size on 4K page systems, we + * allow DEBUG builds to accept 2K page size. + */ +#ifdef CONFIG_BTRFS_DEBUG +#define BTRFS_MIN_BLOCKSIZE (SZ_2K) +#else +#define BTRFS_MIN_BLOCKSIZE (SZ_4K) +#endif + #define BTRFS_MAX_EXTENT_SIZE SZ_128M #define BTRFS_OLDEST_GENERATION 0ULL @@ -105,6 +117,9 @@ enum { /* Indicates there was an error cleaning up a log tree. */ BTRFS_FS_STATE_LOG_CLEANUP_ERROR, + /* No more delayed iput can be queued. */ + BTRFS_FS_STATE_NO_DELAYED_IPUT, + BTRFS_FS_STATE_COUNT }; @@ -285,6 +300,7 @@ enum { #define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) +#define BTRFS_WARNING_COMMIT_INTERVAL (300) #define BTRFS_DEFAULT_MAX_INLINE (2048) struct btrfs_dev_replace { @@ -404,6 +420,8 @@ struct btrfs_commit_stats { u64 last_commit_dur; /* The total commit duration in ns */ u64 total_commit_dur; + /* Start of the last critical section in ns. */ + u64 critical_section_start_time; }; struct btrfs_fs_info { @@ -456,6 +474,8 @@ struct btrfs_fs_info { struct btrfs_block_rsv delayed_block_rsv; /* Block reservation for delayed refs */ struct btrfs_block_rsv delayed_refs_rsv; + /* Block reservation for treelog tree */ + struct btrfs_block_rsv treelog_rsv; struct btrfs_block_rsv empty_block_rsv; @@ -485,8 +505,8 @@ struct btrfs_fs_info { u64 last_trans_log_full_commit; unsigned long long mount_opt; - unsigned long compress_type:4; - unsigned int compress_level; + int compress_type; + int compress_level; u32 commit_interval; /* * It is a suggestive number, the read side is safe even it gets a @@ -627,6 +647,9 @@ struct btrfs_fs_info { struct kobject *qgroups_kobj; struct kobject *discard_kobj; + /* Track the number of blocks (sectors) read by the filesystem. */ + struct percpu_counter stats_read_blocks; + /* Used to keep from writing metadata until there is a nice batch */ struct percpu_counter dirty_metadata_bytes; struct percpu_counter delalloc_bytes; @@ -692,8 +715,6 @@ struct btrfs_fs_info { u32 data_chunk_allocations; u32 metadata_ratio; - void *bdev_holder; - /* Private scrub information */ struct mutex scrub_lock; atomic_t scrubs_running; @@ -706,7 +727,6 @@ struct btrfs_fs_info { * running. */ refcount_t scrub_workers_refcnt; - u32 sectors_per_page; struct workqueue_struct *scrub_workers; struct btrfs_discard_ctl discard_ctl; @@ -719,12 +739,6 @@ struct btrfs_fs_info { spinlock_t qgroup_lock; /* - * Used to avoid frequently calling ulist_alloc()/ulist_free() - * when doing qgroup accounting, it must be protected by qgroup_lock. - */ - struct ulist *qgroup_ulist; - - /* * Protect user change for quota operations. If a transaction is needed, * it must be started before locking this lock. */ @@ -759,10 +773,8 @@ struct btrfs_fs_info { struct btrfs_delayed_root *delayed_root; - /* Extent buffer radix tree */ - spinlock_t buffer_lock; - /* Entries are eb->start / sectorsize */ - struct radix_tree_root buffer_radix; + /* Entries are eb->start >> nodesize_bits */ + struct xarray buffer_tree; /* Next backup root to be overwritten */ int backup_root_index; @@ -793,6 +805,7 @@ struct btrfs_fs_info { /* Cached block sizes */ u32 nodesize; + u32 nodesize_bits; u32 sectorsize; /* ilog2 of sectorsize, use to avoid 64bit division */ u32 sectorsize_bits; @@ -887,6 +900,11 @@ struct btrfs_fs_info { #define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \ struct inode *: (_inode)))->root->fs_info) +static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) +{ + return mapping_gfp_constraint(mapping, ~__GFP_FS); +} + static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info) { return READ_ONCE(fs_info->generation); @@ -953,6 +971,8 @@ static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info, #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ sizeof(struct btrfs_item)) +#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) ((bytes) >> (fs_info)->sectorsize_bits) + static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) { return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && fs_info->zone_size > 0; @@ -971,6 +991,12 @@ static inline u32 count_max_extents(const struct btrfs_fs_info *fs_info, u64 siz return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size); } +static inline unsigned int btrfs_blocks_per_folio(const struct btrfs_fs_info *fs_info, + const struct folio *folio) +{ + return folio_size(folio) >> fs_info->sectorsize_bits; +} + bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type); bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, @@ -982,6 +1008,17 @@ void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args); +u16 btrfs_csum_type_size(u16 type); +int btrfs_super_csum_size(const struct btrfs_super_block *s); +const char *btrfs_super_csum_name(u16 csum_type); +const char *btrfs_super_csum_driver(u16 csum_type); +size_t __attribute_const__ btrfs_get_num_csums(void); + +static inline bool btrfs_is_empty_uuid(const u8 *uuid) +{ + return uuid_is_null((const uuid_t *)uuid); +} + /* Compatibility and incompatibility defines */ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name); @@ -1058,6 +1095,14 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ &(fs_info)->fs_state))) +/* + * We use folio flag owner_2 to indicate there is an ordered extent with + * unfinished IO. + */ +#define folio_test_ordered(folio) folio_test_owner_2(folio) +#define folio_set_ordered(folio) folio_set_owner_2(folio) +#define folio_clear_ordered(folio) folio_clear_owner_2(folio) + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS #define EXPORT_FOR_TESTS diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 29572dfaf878..f06cf701ae5a 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -78,13 +78,10 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( } /* Returns NULL if no extref found */ -struct btrfs_inode_extref * -btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const struct fscrypt_str *name, - u64 inode_objectid, u64 ref_objectid, int ins_len, - int cow) +struct btrfs_inode_extref *btrfs_lookup_inode_extref(struct btrfs_root *root, + struct btrfs_path *path, + const struct fscrypt_str *name, + u64 inode_objectid, u64 ref_objectid) { int ret; struct btrfs_key key; @@ -93,7 +90,7 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, key.type = BTRFS_INODE_EXTREF_KEY; key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len); - ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) return ERR_PTR(ret); if (ret > 0) @@ -109,7 +106,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, u64 inode_objectid, u64 ref_objectid, u64 *index) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_inode_extref *extref; struct extent_buffer *leaf; @@ -129,9 +126,9 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) - ret = -ENOENT; + return -ENOENT; if (ret < 0) - goto out; + return ret; /* * Sanity check - did we find the right item for this name? @@ -142,8 +139,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, ref_objectid, name); if (!extref) { btrfs_abort_transaction(trans, -ENOENT); - ret = -ENOENT; - goto out; + return -ENOENT; } leaf = path->nodes[0]; @@ -152,12 +148,8 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, *index = btrfs_inode_extref_index(leaf, extref); if (del_len == item_size) { - /* - * Common case only one ref in the item, remove the - * whole item. - */ - ret = btrfs_del_item(trans, root, path); - goto out; + /* Common case only one ref in the item, remove the whole item. */ + return btrfs_del_item(trans, root, path); } ptr = (unsigned long)extref; @@ -168,9 +160,6 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, btrfs_truncate_item(trans, path, item_size - del_len, 1); -out: - btrfs_free_path(path); - return ret; } @@ -191,8 +180,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, int del_len = name->len + sizeof(*ref); key.objectid = inode_objectid; - key.offset = ref_objectid; key.type = BTRFS_INODE_REF_KEY; + key.offset = ref_objectid; path = btrfs_alloc_path(); if (!path) @@ -260,7 +249,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, int ret; int ins_len = name->len + sizeof(*extref); unsigned long ptr; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *leaf; @@ -279,13 +268,13 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, path->slots[0], ref_objectid, name)) - goto out; + return ret; btrfs_extend_item(trans, path, ins_len); ret = 0; } if (ret < 0) - goto out; + return ret; leaf = path->nodes[0]; ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char); @@ -298,11 +287,8 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, ptr = (unsigned long)&extref->name; write_extent_buffer(path->nodes[0], name->name, ptr, name->len); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); -out: - btrfs_free_path(path); - return ret; + return 0; } /* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */ @@ -319,8 +305,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, int ins_len = name->len + sizeof(*ref); key.objectid = inode_objectid; - key.offset = ref_objectid; key.type = BTRFS_INODE_REF_KEY; + key.offset = ref_objectid; path = btrfs_alloc_path(); if (!path) @@ -363,8 +349,6 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, ptr = (unsigned long)(ref + 1); } write_extent_buffer(path->nodes[0], name->name, ptr, name->len); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); - out: btrfs_free_path(path); @@ -497,8 +481,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, path->reada = READA_BACK; key.objectid = control->ino; - key.offset = (u64)-1; key.type = (u8)-1; + key.offset = (u64)-1; search_again: /* @@ -590,7 +574,6 @@ search_again: num_dec = (orig_num_bytes - extent_num_bytes); if (extent_start != 0) control->sub_bytes += num_dec; - btrfs_mark_buffer_dirty(trans, leaf); } else { extent_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); @@ -734,13 +717,12 @@ delete: } out: if (ret >= 0 && pending_del_nr) { - int err; + int ret2; - err = btrfs_del_items(trans, root, path, pending_del_slot, - pending_del_nr); - if (err) { - btrfs_abort_transaction(trans, err); - ret = err; + ret2 = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); + if (ret2) { + btrfs_abort_transaction(trans, ret2); + ret = ret2; } } diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index c11b97fdccc4..6d9f5ad20646 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -101,13 +101,10 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *location, int mod); -struct btrfs_inode_extref *btrfs_lookup_inode_extref( - struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const struct fscrypt_str *name, - u64 inode_objectid, u64 ref_objectid, int ins_len, - int cow); +struct btrfs_inode_extref *btrfs_lookup_inode_extref(struct btrfs_root *root, + struct btrfs_path *path, + const struct fscrypt_str *name, + u64 inode_objectid, u64 ref_objectid); struct btrfs_inode_ref *btrfs_find_name_in_backref(const struct extent_buffer *leaf, int slot, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 03fe0de2cd0d..b77dd22b8cdb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -308,7 +308,7 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, const u32 csum_size = root->fs_info->csum_size; /* For data reloc tree, it's better to do a backref lookup instead. */ - if (btrfs_root_id(root) == BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_is_data_reloc_root(root)) return print_data_reloc_error(inode, logical_start, csum, csum_expected, mirror_num); @@ -393,34 +393,13 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags) * extent (btrfs_finish_ordered_io()). */ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, - struct folio *locked_folio, u64 offset, u64 bytes) { - unsigned long index = offset >> PAGE_SHIFT; - unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; - u64 page_start = 0, page_end = 0; + pgoff_t index = offset >> PAGE_SHIFT; + const pgoff_t end_index = (offset + bytes - 1) >> PAGE_SHIFT; struct folio *folio; - if (locked_folio) { - page_start = folio_pos(locked_folio); - page_end = page_start + folio_size(locked_folio) - 1; - } - while (index <= end_index) { - /* - * For locked page, we will call btrfs_mark_ordered_io_finished - * through btrfs_mark_ordered_io_finished() on it - * in run_delalloc_range() for the error handling, which will - * clear page Ordered and run the ordered extent accounting. - * - * Here we can't just clear the Ordered bit, or - * btrfs_mark_ordered_io_finished() would skip the accounting - * for the page range, and the ordered extent will never finish. - */ - if (locked_folio && index == (page_start >> PAGE_SHIFT)) { - index++; - continue; - } folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); index++; if (IS_ERR(folio)) @@ -436,23 +415,6 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, folio_put(folio); } - if (locked_folio) { - /* The locked page covers the full range, nothing needs to be done */ - if (bytes + offset <= page_start + folio_size(locked_folio)) - return; - /* - * In case this page belongs to the delalloc range being - * instantiated then skip it, since the first page of a range is - * going to be properly cleaned up by the caller of - * run_delalloc_range - */ - if (page_start >= offset && page_end <= (offset + bytes - 1)) { - bytes = offset + bytes - folio_pos(locked_folio) - - folio_size(locked_folio); - offset = folio_pos(locked_folio) + folio_size(locked_folio); - } - } - return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); } @@ -461,18 +423,18 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct btrfs_new_inode_args *args) { - int err; + int ret; if (args->default_acl) { - err = __btrfs_set_acl(trans, args->inode, args->default_acl, + ret = __btrfs_set_acl(trans, args->inode, args->default_acl, ACL_TYPE_DEFAULT); - if (err) - return err; + if (ret) + return ret; } if (args->acl) { - err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS); - if (err) - return err; + ret = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS); + if (ret) + return ret; } if (!args->default_acl && !args->acl) cache_no_acl(args->inode); @@ -527,8 +489,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, size_t datasize; key.objectid = btrfs_ino(inode); - key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; datasize = btrfs_file_extent_calc_inline_size(cur_size); ret = btrfs_insert_empty_item(trans, root, path, &key, @@ -564,7 +526,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, kunmap_local(kaddr); folio_put(folio); } - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* @@ -605,23 +566,14 @@ static bool can_cow_file_range_inline(struct btrfs_inode *inode, if (offset != 0) return false; - /* - * Due to the page size limit, for subpage we can only trigger the - * writeback for the dirty sectors of page, that means data writeback - * is doing more writeback than what we want. - * - * This is especially unexpected for some call sites like fallocate, - * where we only increase i_size after everything is done. - * This means we can trigger inline extent even if we didn't want to. - * So here we skip inline extent creation completely. - */ - if (fs_info->sectorsize != PAGE_SIZE) - return false; - /* Inline extents are limited to sectorsize. */ if (size > fs_info->sectorsize) return false; + /* We do not allow a non-compressed extent to be as large as block size. */ + if (data_len >= fs_info->sectorsize) + return false; + /* We cannot exceed the maximum inline data size. */ if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) return false; @@ -711,7 +663,7 @@ out: * And at reserve time, it's always aligned to page size, so * just free one page here. */ - btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL); + btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; @@ -734,12 +686,12 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, if (!can_cow_file_range_inline(inode, offset, size, compressed_size)) return 1; - lock_extent(&inode->io_tree, offset, end, &cached); + btrfs_lock_extent(&inode->io_tree, offset, end, &cached); ret = __cow_file_range_inline(inode, size, compressed_size, compress_type, compressed_folio, update_i_size); if (ret > 0) { - unlock_extent(&inode->io_tree, offset, end, &cached); + btrfs_unlock_extent(&inode->io_tree, offset, end, &cached); return ret; } @@ -825,33 +777,19 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, struct btrfs_fs_info *fs_info = inode->root->fs_info; if (!btrfs_inode_can_compress(inode)) { - WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), - KERN_ERR "BTRFS: unexpected compression for ino %llu\n", - btrfs_ino(inode)); + DEBUG_WARN("BTRFS: unexpected compression for ino %llu", btrfs_ino(inode)); return 0; } - /* - * Only enable sector perfect compression for experimental builds. - * - * This is a big feature change for subpage cases, and can hit - * different corner cases, so only limit this feature for - * experimental build for now. - * - * ETA for moving this out of experimental builds is 6.15. - */ - if (fs_info->sectorsize < PAGE_SIZE && - !IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) { - if (!PAGE_ALIGNED(start) || - !PAGE_ALIGNED(end + 1)) - return 0; - } + /* Defrag ioctl takes precedence over mount options and properties. */ + if (inode->defrag_compress == BTRFS_DEFRAG_DONT_COMPRESS) + return 0; + if (BTRFS_COMPRESS_NONE < inode->defrag_compress && + inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) + return 1; /* force compress */ if (btrfs_test_opt(fs_info, FORCE_COMPRESS)) return 1; - /* defrag ioctl */ - if (inode->defrag_compress) - return 1; /* bad compression ratios */ if (inode->flags & BTRFS_INODE_NOCOMPRESS) return 0; @@ -871,21 +809,20 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, btrfs_add_inode_defrag(inode, small_write); } -static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, u64 end) { - unsigned long end_index = end >> PAGE_SHIFT; + const pgoff_t end_index = end >> PAGE_SHIFT; struct folio *folio; int ret = 0; - for (unsigned long index = start >> PAGE_SHIFT; - index <= end_index; index++) { - folio = filemap_get_folio(inode->i_mapping, index); + for (pgoff_t index = start >> PAGE_SHIFT; index <= end_index; index++) { + folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); if (IS_ERR(folio)) { if (!ret) ret = PTR_ERR(folio); continue; } - btrfs_folio_clamp_clear_dirty(inode_to_fs_info(inode), folio, start, + btrfs_folio_clamp_clear_dirty(inode->root->fs_info, folio, start, end + 1 - start); folio_put(folio); } @@ -925,6 +862,7 @@ static void compress_file_range(struct btrfs_work *work) unsigned int poff; int i; int compress_type = fs_info->compress_type; + int compress_level = fs_info->compress_level; inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); @@ -933,7 +871,7 @@ static void compress_file_range(struct btrfs_work *work) * Otherwise applications with the file mmap'd can wander in and change * the page contents while we are compressing them. */ - ret = extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); + ret = extent_range_clear_dirty_for_io(inode, start, end); /* * All the folios should have been locked thus no failure. @@ -1007,13 +945,15 @@ again: goto cleanup_and_bail_uncompressed; } - if (inode->defrag_compress) + if (0 < inode->defrag_compress && inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) { compress_type = inode->defrag_compress; - else if (inode->prop_compress) + compress_level = inode->defrag_compress_level; + } else if (inode->prop_compress) { compress_type = inode->prop_compress; + } /* Compression level is applied here. */ - ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4), + ret = btrfs_compress_folios(compress_type, compress_level, mapping, start, folios, &nr_folios, &total_in, &total_compressed); if (ret) @@ -1129,19 +1069,13 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { - btrfs_cleanup_ordered_extents(inode, locked_folio, - start, end - start + 1); - if (locked_folio) { - const u64 page_start = folio_pos(locked_folio); - - folio_start_writeback(locked_folio); - folio_end_writeback(locked_folio); - btrfs_mark_ordered_io_finished(inode, locked_folio, - page_start, PAGE_SIZE, - !ret); - mapping_set_error(locked_folio->mapping, ret); - folio_unlock(locked_folio); - } + if (locked_folio) + btrfs_folio_end_lock(inode->root->fs_info, locked_folio, + start, async_extent->ram_size); + btrfs_err_rl(inode->root->fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), start, async_extent->ram_size, ret); } } @@ -1160,6 +1094,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, struct extent_state *cached = NULL; struct extent_map *em; int ret = 0; + bool free_pages = false; u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; @@ -1180,7 +1115,10 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, } if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { + ASSERT(!async_extent->folios); + ASSERT(async_extent->nr_folios == 0); submit_uncompressed_range(inode, async_extent, locked_folio); + free_pages = true; goto done; } @@ -1196,10 +1134,11 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, * fall back to uncompressed. */ submit_uncompressed_range(inode, async_extent, locked_folio); + free_pages = true; goto done; } - lock_extent(io_tree, start, end, &cached); + btrfs_lock_extent(io_tree, start, end, &cached); /* Here we're doing allocation and writeback of the compressed pages */ file_extent.disk_bytenr = ins.objectid; @@ -1214,10 +1153,10 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, ret = PTR_ERR(em); goto out_free_reserve; } - free_extent_map(em); + btrfs_free_extent_map(em); ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, - 1 << BTRFS_ORDERED_COMPRESSED); + 1U << BTRFS_ORDERED_COMPRESSED); if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); ret = PTR_ERR(ordered); @@ -1237,12 +1176,14 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, done: if (async_chunk->blkcg_css) kthread_associate_blkcg(NULL); + if (free_pages) + free_async_extent_pages(async_extent); kfree(async_extent); return; out_free_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); mapping_set_error(inode->vfs_inode.i_mapping, -EIO); extent_clear_unlock_delalloc(inode, start, end, NULL, &cached, @@ -1269,7 +1210,7 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, u64 alloc_hint = 0; read_lock(&em_tree->lock); - em = search_extent_mapping(em_tree, start, num_bytes); + em = btrfs_search_extent_mapping(em_tree, start, num_bytes); if (em) { /* * if block start isn't an actual block number then find the @@ -1277,15 +1218,15 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * block is also bogus then just don't worry about it. */ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { - free_extent_map(em); - em = search_extent_mapping(em_tree, 0, 0); + btrfs_free_extent_map(em); + em = btrfs_search_extent_mapping(em_tree, 0, 0); if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE) - alloc_hint = extent_map_block_start(em); + alloc_hint = btrfs_extent_map_block_start(em); if (em) - free_extent_map(em); + btrfs_free_extent_map(em); } else { - alloc_hint = extent_map_block_start(em); - free_extent_map(em); + alloc_hint = btrfs_extent_map_block_start(em); + btrfs_free_extent_map(em); } } read_unlock(&em_tree->lock); @@ -1316,10 +1257,7 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * - Else all pages except for @locked_folio are unlocked. * * When a failure happens in the second or later iteration of the - * while-loop, the ordered extents created in previous iterations are kept - * intact. So, the caller must clean them up by calling - * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for - * example. + * while-loop, the ordered extents created in previous iterations are cleaned up. */ static noinline int cow_file_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, @@ -1373,6 +1311,17 @@ static noinline int cow_file_range(struct btrfs_inode *inode, alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes); /* + * We're not doing compressed IO, don't unlock the first page (which + * the caller expects to stay locked), don't clear any dirty bits and + * don't set any writeback bits. + * + * Do set the Ordered (Private2) bit so we know this page was properly + * setup for writepage. + */ + page_ops = (keep_locked ? 0 : PAGE_UNLOCK); + page_ops |= PAGE_SET_ORDERED; + + /* * Relocation relies on the relocated extents to have exactly the same * size as the original extents. Normally writeback for relocation data * extents follows a NOCOW path because relocation preallocates the @@ -1415,8 +1364,13 @@ static noinline int cow_file_range(struct btrfs_inode *inode, continue; } if (done_offset) { - *done_offset = start - 1; - return 0; + /* + * Move @end to the end of the processed range, + * and exit the loop to unlock the processed extents. + */ + end = start - 1; + ret = 0; + break; } ret = -ENOSPC; } @@ -1431,24 +1385,28 @@ static noinline int cow_file_range(struct btrfs_inode *inode, file_extent.offset = 0; file_extent.compression = BTRFS_COMPRESS_NONE; - lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, - &cached); + /* + * Locked range will be released either during error clean up or + * after the whole range is finished. + */ + btrfs_lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, + &cached); em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_REGULAR); if (IS_ERR(em)) { - unlock_extent(&inode->io_tree, start, - start + cur_alloc_size - 1, &cached); + btrfs_unlock_extent(&inode->io_tree, start, + start + cur_alloc_size - 1, &cached); ret = PTR_ERR(em); goto out_reserve; } - free_extent_map(em); + btrfs_free_extent_map(em); ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, - 1 << BTRFS_ORDERED_REGULAR); + 1U << BTRFS_ORDERED_REGULAR); if (IS_ERR(ordered)) { - unlock_extent(&inode->io_tree, start, - start + cur_alloc_size - 1, &cached); + btrfs_unlock_extent(&inode->io_tree, start, + start + cur_alloc_size - 1, &cached); ret = PTR_ERR(ordered); goto out_drop_extent_cache; } @@ -1476,21 +1434,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, btrfs_dec_block_group_reservations(fs_info, ins.objectid); - /* - * We're not doing compressed IO, don't unlock the first page - * (which the caller expects to stay locked), don't clear any - * dirty bits and don't set any writeback bits - * - * Do set the Ordered flag so we know this page was - * properly setup for writepage. - */ - page_ops = (keep_locked ? 0 : PAGE_UNLOCK); - page_ops |= PAGE_SET_ORDERED; - - extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, - locked_folio, &cached, - EXTENT_LOCKED | EXTENT_DELALLOC, - page_ops); if (num_bytes < cur_alloc_size) num_bytes = 0; else @@ -1507,6 +1450,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret) goto out_unlock; } + extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached, + EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); done: if (done_offset) *done_offset = end; @@ -1516,7 +1461,7 @@ out_drop_extent_cache: btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false); out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); out_unlock: /* * Now, we have three regions to clean up: @@ -1527,35 +1472,30 @@ out_unlock: * We process each region below. */ - clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; - page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; - /* * For the range (1). We have already instantiated the ordered extents - * for this region. They are cleaned up by - * btrfs_cleanup_ordered_extents() in e.g, - * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are - * already cleared in the above loop. And, EXTENT_DELALLOC_NEW | - * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup - * function. + * for this region, thus we need to cleanup those ordered extents. + * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV + * are also handled by the ordered extents cleanup. * - * However, in case of @keep_locked, we still need to unlock the pages - * (except @locked_folio) to ensure all the pages are unlocked. + * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and + * finish the writeback of the involved folios, which will be never submitted. */ - if (keep_locked && orig_start < start) { + if (orig_start < start) { + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; + if (!locked_folio) mapping_set_error(inode->vfs_inode.i_mapping, ret); + + btrfs_cleanup_ordered_extents(inode, orig_start, start - orig_start); extent_clear_unlock_delalloc(inode, orig_start, start - 1, - locked_folio, NULL, 0, page_ops); + locked_folio, NULL, clear_bits, page_ops); } - /* - * At this point we're unlocked, we want to make sure we're only - * clearing these flags under the extent lock, so lock the rest of the - * range and clear everything up. - */ - lock_extent(&inode->io_tree, start, end, NULL); + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; /* * For the range (2). If we reserved an extent for our delalloc range @@ -1589,6 +1529,10 @@ out_unlock: btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size, end - start - cur_alloc_size + 1, NULL); } + btrfs_err_rl(fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), orig_start, end + 1 - orig_start, ret); return ret; } @@ -1626,8 +1570,8 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_ PAGE_SHIFT; while (!list_empty(&async_chunk->extents)) { - async_extent = list_entry(async_chunk->extents.next, - struct async_extent, list); + async_extent = list_first_entry(&async_chunk->extents, + struct async_extent, list); list_del(&async_extent->list); submit_one_async_extent(async_chunk, async_extent, &alloc_hint); } @@ -1797,9 +1741,9 @@ static int fallback_to_cow(struct btrfs_inode *inode, * group that contains that extent to RO mode and therefore force COW * when starting writeback. */ - lock_extent(io_tree, start, end, &cached_state); - count = count_range_bits(io_tree, &range_start, end, range_bytes, - EXTENT_NORESERVE, 0, NULL); + btrfs_lock_extent(io_tree, start, end, &cached_state); + count = btrfs_count_range_bits(io_tree, &range_start, end, range_bytes, + EXTENT_NORESERVE, 0, NULL); if (count > 0 || is_space_ino || is_reloc_ino) { u64 bytes = count; struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1809,14 +1753,14 @@ static int fallback_to_cow(struct btrfs_inode *inode, bytes = range_bytes; spin_lock(&sinfo->lock); - btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes); + btrfs_space_info_update_bytes_may_use(sinfo, bytes); spin_unlock(&sinfo->lock); if (count > 0) - clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, - NULL); + btrfs_clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, + &cached_state); } - unlock_extent(io_tree, start, end, &cached_state); + btrfs_unlock_extent(io_tree, start, end, &cached_state); /* * Don't try to create inline extents, as a mix of inline extent that @@ -1837,7 +1781,6 @@ struct can_nocow_file_extent_args { /* End file offset (inclusive) of the range we want to NOCOW. */ u64 end; bool writeback_path; - bool strict; /* * Free the path passed to can_nocow_file_extent() once it's not needed * anymore. @@ -1892,8 +1835,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, * for its subvolume was created, then this implies the extent is shared, * hence we must COW. */ - if (!args->strict && - btrfs_file_extent_generation(leaf, fi) <= + if (btrfs_file_extent_generation(leaf, fi) <= btrfs_root_last_snapshot(&root->root_item)) goto out; @@ -1922,9 +1864,8 @@ static int can_nocow_file_extent(struct btrfs_path *path, */ btrfs_release_path(path); - ret = btrfs_cross_ref_exist(root, btrfs_ino(inode), - key->offset - args->file_extent.offset, - args->file_extent.disk_bytenr, args->strict, path); + ret = btrfs_cross_ref_exist(inode, key->offset - args->file_extent.offset, + args->file_extent.disk_bytenr, path); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out; @@ -1971,6 +1912,112 @@ static int can_nocow_file_extent(struct btrfs_path *path, } /* + * Cleanup the dirty folios which will never be submitted due to error. + * + * When running a delalloc range, we may need to split the ranges (due to + * fragmentation or NOCOW). If we hit an error in the later part, we will error + * out and previously successfully executed range will never be submitted, thus + * we have to cleanup those folios by clearing their dirty flag, starting and + * finishing the writeback. + */ +static void cleanup_dirty_folios(struct btrfs_inode *inode, + struct folio *locked_folio, + u64 start, u64 end, int error) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct address_space *mapping = inode->vfs_inode.i_mapping; + pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t end_index = end >> PAGE_SHIFT; + u32 len; + + ASSERT(end + 1 - start < U32_MAX); + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(end + 1, fs_info->sectorsize)); + len = end + 1 - start; + + /* + * Handle the locked folio first. + * The btrfs_folio_clamp_*() helpers can handle range out of the folio case. + */ + btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); + + for (pgoff_t index = start_index; index <= end_index; index++) { + struct folio *folio; + + /* Already handled at the beginning. */ + if (index == locked_folio->index) + continue; + folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS); + /* Cache already dropped, no need to do any cleanup. */ + if (IS_ERR(folio)) + continue; + btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); + folio_unlock(folio); + folio_put(folio); + } + mapping_set_error(mapping, error); +} + +static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio, + struct extent_state **cached, + struct can_nocow_file_extent_args *nocow_args, + u64 file_pos, bool is_prealloc) +{ + struct btrfs_ordered_extent *ordered; + u64 len = nocow_args->file_extent.num_bytes; + u64 end = file_pos + len - 1; + int ret = 0; + + btrfs_lock_extent(&inode->io_tree, file_pos, end, cached); + + if (is_prealloc) { + struct extent_map *em; + + em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent, + BTRFS_ORDERED_PREALLOC); + if (IS_ERR(em)) { + btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached); + return PTR_ERR(em); + } + btrfs_free_extent_map(em); + } + + ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent, + is_prealloc + ? (1U << BTRFS_ORDERED_PREALLOC) + : (1U << BTRFS_ORDERED_NOCOW)); + if (IS_ERR(ordered)) { + if (is_prealloc) + btrfs_drop_extent_map_range(inode, file_pos, end, false); + btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached); + return PTR_ERR(ordered); + } + + if (btrfs_is_data_reloc_root(inode->root)) + /* + * Errors are handled later, as we must prevent + * extent_clear_unlock_delalloc() in error handler from freeing + * metadata of the created ordered extent. + */ + ret = btrfs_reloc_clone_csums(ordered); + btrfs_put_ordered_extent(ordered); + + extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_CLEAR_DATA_RESV, + PAGE_UNLOCK | PAGE_SET_ORDERED); + /* + * On error, we need to cleanup the ordered extents we created. + * + * We do not clear the folio Dirty flags because they are set and + * cleaered by the caller. + */ + if (ret < 0) + btrfs_cleanup_ordered_extents(inode, file_pos, end); + return ret; +} + +/* * when nowcow writeback call back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. * @@ -1985,6 +2032,11 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, struct btrfs_root *root = inode->root; struct btrfs_path *path; u64 cow_start = (u64)-1; + /* + * If not 0, represents the inclusive end of the last fallback_to_cow() + * range. Only for error handling. + */ + u64 cow_end = 0; u64 cur_offset = start; int ret; bool check_prev = true; @@ -2009,15 +2061,12 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, while (cur_offset <= end) { struct btrfs_block_group *nocow_bg = NULL; - struct btrfs_ordered_extent *ordered; struct btrfs_key found_key; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; struct extent_state *cached_state = NULL; u64 extent_end; - u64 nocow_end; int extent_type; - bool is_prealloc; ret = btrfs_lookup_file_extent(NULL, root, path, ino, cur_offset, 0); @@ -2072,12 +2121,13 @@ next_slot: /* * If the found extent starts after requested offset, then - * adjust extent_end to be right before this extent begins + * adjust cur_offset to be right before this extent begins. */ if (found_key.offset > cur_offset) { - extent_end = found_key.offset; - extent_type = 0; - goto must_cow; + if (cow_start == (u64)-1) + cow_start = cur_offset; + cur_offset = found_key.offset; + goto next_slot; } /* @@ -2143,74 +2193,21 @@ must_cow: if (cow_start != (u64)-1) { ret = fallback_to_cow(inode, locked_folio, cow_start, found_key.offset - 1); - cow_start = (u64)-1; if (ret) { + cow_end = found_key.offset - 1; btrfs_dec_nocow_writers(nocow_bg); goto error; } + cow_start = (u64)-1; } - nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1; - lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state); - - is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC; - if (is_prealloc) { - struct extent_map *em; - - em = btrfs_create_io_em(inode, cur_offset, - &nocow_args.file_extent, - BTRFS_ORDERED_PREALLOC); - if (IS_ERR(em)) { - unlock_extent(&inode->io_tree, cur_offset, - nocow_end, &cached_state); - btrfs_dec_nocow_writers(nocow_bg); - ret = PTR_ERR(em); - goto error; - } - free_extent_map(em); - } - - ordered = btrfs_alloc_ordered_extent(inode, cur_offset, - &nocow_args.file_extent, - is_prealloc - ? (1 << BTRFS_ORDERED_PREALLOC) - : (1 << BTRFS_ORDERED_NOCOW)); + ret = nocow_one_range(inode, locked_folio, &cached_state, + &nocow_args, cur_offset, + extent_type == BTRFS_FILE_EXTENT_PREALLOC); btrfs_dec_nocow_writers(nocow_bg); - if (IS_ERR(ordered)) { - if (is_prealloc) { - btrfs_drop_extent_map_range(inode, cur_offset, - nocow_end, false); - } - unlock_extent(&inode->io_tree, cur_offset, - nocow_end, &cached_state); - ret = PTR_ERR(ordered); + if (ret < 0) goto error; - } - - if (btrfs_is_data_reloc_root(root)) - /* - * Error handled later, as we must prevent - * extent_clear_unlock_delalloc() in error handler - * from freeing metadata of created ordered extent. - */ - ret = btrfs_reloc_clone_csums(ordered); - btrfs_put_ordered_extent(ordered); - - extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, - locked_folio, &cached_state, - EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_CLEAR_DATA_RESV, - PAGE_UNLOCK | PAGE_SET_ORDERED); - cur_offset = extent_end; - - /* - * btrfs_reloc_clone_csums() error, now we're OK to call error - * handler, as metadata for created ordered extent will only - * be freed by btrfs_finish_ordered_io(). - */ - if (ret) - goto error; } btrfs_release_path(path); @@ -2218,11 +2215,12 @@ must_cow: cow_start = cur_offset; if (cow_start != (u64)-1) { - cur_offset = end; ret = fallback_to_cow(inode, locked_folio, cow_start, end); - cow_start = (u64)-1; - if (ret) + if (ret) { + cow_end = end; goto error; + } + cow_start = (u64)-1; } btrfs_free_path(path); @@ -2230,13 +2228,59 @@ must_cow: error: /* - * If an error happened while a COW region is outstanding, cur_offset - * needs to be reset to cow_start to ensure the COW region is unlocked - * as well. + * There are several error cases: + * + * 1) Failed without falling back to COW + * start cur_offset end + * |/////////////| | + * + * In this case, cow_start should be (u64)-1. + * + * For range [start, cur_offset) the folios are already unlocked (except + * @locked_folio), EXTENT_DELALLOC already removed. + * Need to clear the dirty flags and finish the ordered extents. + * + * 2) Failed with error before calling fallback_to_cow() + * + * start cow_start end + * |/////////////| | + * + * In this case, only @cow_start is set, @cur_offset is between + * [cow_start, end) + * + * It's mostly the same as case 1), just replace @cur_offset with + * @cow_start. + * + * 3) Failed with error from fallback_to_cow() + * + * start cow_start cow_end end + * |/////////////|-----------| | + * + * In this case, both @cow_start and @cow_end is set. + * + * For range [start, cow_start) it's the same as case 1). + * But for range [cow_start, cow_end), all the cleanup is handled by + * cow_file_range(), we should not touch anything in that range. + * + * So for all above cases, if @cow_start is set, cleanup ordered extents + * for range [start, @cow_start), other wise cleanup range [start, @cur_offset). */ if (cow_start != (u64)-1) cur_offset = cow_start; + if (cur_offset > start) { + btrfs_cleanup_ordered_extents(inode, start, cur_offset - start); + cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret); + } + + /* + * If an error happened while a COW region is outstanding, cur_offset + * needs to be reset to @cow_end + 1 to skip the COW range, as + * cow_file_range() will do the proper cleanup at error. + */ + if (cow_end) + cur_offset = cow_end + 1; + /* * We need to lock the extent here because we're clearing DELALLOC and * we're not locked at this point. @@ -2244,7 +2288,7 @@ error: if (cur_offset < end) { struct extent_state *cached = NULL; - lock_extent(&inode->io_tree, cur_offset, end, &cached); + btrfs_lock_extent(&inode->io_tree, cur_offset, end, &cached); extent_clear_unlock_delalloc(inode, cur_offset, end, locked_folio, &cached, EXTENT_LOCKED | EXTENT_DELALLOC | @@ -2255,6 +2299,10 @@ error: btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL); } btrfs_free_path(path); + btrfs_err_rl(fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), start, end + 1 - start, ret); return ret; } @@ -2262,7 +2310,7 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) { if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) { if (inode->defrag_bytes && - test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG)) + btrfs_test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG)) return false; return true; } @@ -2283,12 +2331,11 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol * The range must cover part of the @locked_folio, or a return of 1 * can confuse the caller. */ - ASSERT(!(end <= folio_pos(locked_folio) || - start >= folio_pos(locked_folio) + folio_size(locked_folio))); + ASSERT(!(end <= folio_pos(locked_folio) || start >= folio_end(locked_folio))); if (should_nocow(inode, start, end)) { ret = run_delalloc_nocow(inode, locked_folio, start, end); - goto out; + return ret; } if (btrfs_inode_can_compress(inode) && @@ -2302,11 +2349,6 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol else ret = cow_file_range(inode, locked_folio, start, end, NULL, false, false); - -out: - if (ret < 0) - btrfs_cleanup_ordered_extents(inode, locked_folio, start, - end - start + 1); return ret; } @@ -2556,7 +2598,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, !btrfs_is_free_space_inode(inode) && !(state->state & EXTENT_NORESERVE) && (bits & EXTENT_CLEAR_DATA_RESV)) - btrfs_free_reserved_data_space_noquota(fs_info, len); + btrfs_free_reserved_data_space_noquota(inode, len); percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, fs_info->delalloc_batch); @@ -2640,12 +2682,12 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, if (em_len > search_len) em_len = search_len; - ret = set_extent_bit(&inode->io_tree, search_start, - search_start + em_len - 1, - EXTENT_DELALLOC_NEW, cached_state); + ret = btrfs_set_extent_bit(&inode->io_tree, search_start, + search_start + em_len - 1, + EXTENT_DELALLOC_NEW, cached_state); next: - search_start = extent_map_end(em); - free_extent_map(em); + search_start = btrfs_extent_map_end(em); + btrfs_free_extent_map(em); if (ret) return ret; } @@ -2675,8 +2717,8 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, return ret; } - return set_extent_bit(&inode->io_tree, start, end, - EXTENT_DELALLOC | extra_bits, cached_state); + return btrfs_set_extent_bit(&inode->io_tree, start, end, + EXTENT_DELALLOC | extra_bits, cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ @@ -2697,7 +2739,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct btrfs_inode *inode = fixup->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 page_start = folio_pos(folio); - u64 page_end = folio_pos(folio) + folio_size(folio) - 1; + u64 page_end = folio_end(folio) - 1; int ret = 0; bool free_delalloc_space = true; @@ -2751,7 +2793,7 @@ again: if (ret) goto out_page; - lock_extent(&inode->io_tree, page_start, page_end, &cached_state); + btrfs_lock_extent(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ if (folio_test_ordered(folio)) @@ -2759,8 +2801,8 @@ again: ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); if (ordered) { - unlock_extent(&inode->io_tree, page_start, page_end, - &cached_state); + btrfs_unlock_extent(&inode->io_tree, page_start, page_end, + &cached_state); folio_unlock(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); @@ -2786,7 +2828,7 @@ out_reserved: if (free_delalloc_space) btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE, true); - unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); + btrfs_unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); out_page: if (ret) { /* @@ -2833,6 +2875,21 @@ int btrfs_writepage_cow_fixup(struct folio *folio) return 0; /* + * For experimental build, we error out instead of EAGAIN. + * + * We should not hit such out-of-band dirty folios anymore. + */ + if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) { + DEBUG_WARN(); + btrfs_err_rl(fs_info, + "root %lld ino %llu folio %llu is marked dirty without notifying the fs", + btrfs_root_id(BTRFS_I(inode)->root), + btrfs_ino(BTRFS_I(inode)), + folio_pos(folio)); + return -EUCLEAN; + } + + /* * folio_checked is set below when we create a fixup worker for this * folio, don't try to create another one if we're already * folio_test_checked. @@ -2851,7 +2908,7 @@ int btrfs_writepage_cow_fixup(struct folio *folio) * We are already holding a reference to this inode from * write_cache_pages. We need to hold it because the space reservation * takes place outside of the folio lock, and we can't trust - * page->mapping outside of the folio lock. + * folio->mapping outside of the folio lock. */ ihold(inode); btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); @@ -2872,7 +2929,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; const u64 sectorsize = root->fs_info->sectorsize; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key ins; u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi); @@ -2907,8 +2964,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, if (!drop_args.extent_inserted) { ins.objectid = btrfs_ino(inode); - ins.offset = file_pos; ins.type = BTRFS_EXTENT_DATA_KEY; + ins.offset = file_pos; ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*stack_fi)); @@ -2921,7 +2978,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(struct btrfs_file_extent_item)); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* @@ -2944,8 +3000,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found); ins.objectid = disk_bytenr; - ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; + ins.offset = disk_num_bytes; ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes); if (ret) @@ -2955,8 +3011,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, file_pos - offset, qgroup_reserved, &ins); out: - btrfs_free_path(path); - return ret; } @@ -3063,6 +3117,21 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } + /* + * If it's a COW write we need to lock the extent range as we will be + * inserting/replacing file extent items and unpinning an extent map. + * This must be taken before joining a transaction, as it's a higher + * level lock (like the inode's VFS lock), otherwise we can run into an + * ABBA deadlock with other tasks (transactions work like a lock, + * depending on their current state). + */ + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { + clear_bits |= EXTENT_LOCKED | EXTENT_FINISHING_ORDERED; + btrfs_lock_extent_bits(io_tree, start, end, + EXTENT_LOCKED | EXTENT_FINISHING_ORDERED, + &cached_state); + } + if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); else @@ -3099,9 +3168,6 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } - clear_bits |= EXTENT_LOCKED; - lock_extent(io_tree, start, end, &cached_state); - if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { @@ -3127,8 +3193,8 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } - ret = unpin_extent_cache(inode, ordered_extent->file_offset, - ordered_extent->num_bytes, trans->transid); + ret = btrfs_unpin_extent_cache(inode, ordered_extent->file_offset, + ordered_extent->num_bytes, trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; @@ -3147,9 +3213,9 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) */ if ((clear_bits & EXTENT_DELALLOC_NEW) && !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) - clear_extent_bit(&inode->io_tree, start, end, - EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, - &cached_state); + btrfs_clear_extent_bit(&inode->io_tree, start, end, + EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, + &cached_state); btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, inode); @@ -3158,15 +3224,13 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } out: - clear_extent_bit(&inode->io_tree, start, end, clear_bits, - &cached_state); + btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits, + &cached_state); if (trans) btrfs_end_transaction(trans); if (ret || truncated) { - u64 unwritten_start = start; - /* * If we failed to finish this ordered extent for any reason we * need to make sure BTRFS_ORDERED_IOERR is set on the ordered @@ -3178,10 +3242,6 @@ out: if (ret) btrfs_mark_ordered_extent_error(ordered_extent); - if (truncated) - unwritten_start += logical_len; - clear_extent_uptodate(io_tree, unwritten_start, end, NULL); - /* * Drop extent maps for the part of the extent we didn't write. * @@ -3196,9 +3256,15 @@ out: * we don't mess with the extent map tree in the NOCOW case, but * for now simply skip this if we are the free space inode. */ - if (!btrfs_is_free_space_inode(inode)) + if (!btrfs_is_free_space_inode(inode)) { + u64 unwritten_start = start; + + if (truncated) + unwritten_start += logical_len; + btrfs_drop_extent_map_range(inode, unwritten_start, end, false); + } /* * If the ordered extent had an IOERR or something else went @@ -3225,7 +3291,7 @@ out: NULL); btrfs_free_reserved_extent(fs_info, ordered_extent->disk_bytenr, - ordered_extent->disk_num_bytes, 1); + ordered_extent->disk_num_bytes, true); /* * Actually free the qgroup rsv which was released when * the ordered extent was created. @@ -3262,20 +3328,16 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) /* * Verify the checksum for a single sector without any extra action that depend * on the type of I/O. + * + * @kaddr must be a properly kmapped address. */ -int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, - u32 pgoff, u8 *csum, const u8 * const csum_expected) +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum, + const u8 * const csum_expected) { SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - char *kaddr; - - ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE); shash->tfm = fs_info->csum_shash; - - kaddr = kmap_local_page(page) + pgoff; crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); - kunmap_local(kaddr); if (memcmp(csum, csum_expected, fs_info->csum_size)) return -EIO; @@ -3304,6 +3366,7 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u64 end = file_offset + bv->bv_len - 1; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; + void *kaddr; ASSERT(bv->bv_len == fs_info->sectorsize); @@ -3311,19 +3374,22 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, return true; if (btrfs_is_data_reloc_root(inode->root) && - test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, - NULL)) { + btrfs_test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, + NULL)) { /* Skip the range without csum for data reloc inode */ - clear_extent_bits(&inode->io_tree, file_offset, end, - EXTENT_NODATASUM); + btrfs_clear_extent_bit(&inode->io_tree, file_offset, end, + EXTENT_NODATASUM, NULL); return true; } csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) * fs_info->csum_size; - if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum, - csum_expected)) + kaddr = bvec_kmap_local(bv); + if (btrfs_check_sector_csum(fs_info, kaddr, csum, csum_expected)) { + kunmap_local(kaddr); goto zeroit; + } + kunmap_local(kaddr); return true; zeroit: @@ -3353,6 +3419,7 @@ void btrfs_add_delayed_iput(struct btrfs_inode *inode) if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1)) return; + WARN_ON_ONCE(test_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state)); atomic_inc(&fs_info->nr_delayed_iputs); /* * Need to be irq safe here because we can be called from either an irq @@ -3469,11 +3536,10 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans, int btrfs_orphan_cleanup(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key key, found_key; struct btrfs_trans_handle *trans; - struct inode *inode; u64 last_objectid = 0; int ret = 0, nr_unlink = 0; @@ -3492,6 +3558,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) key.offset = (u64)-1; while (1) { + struct btrfs_inode *inode; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; @@ -3615,10 +3683,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * deleted but wasn't. The inode number may have been reused, * but either way, we can delete the orphan item. */ - if (!inode || inode->i_nlink) { + if (!inode || inode->vfs_inode.i_nlink) { if (inode) { - ret = btrfs_drop_verity_items(BTRFS_I(inode)); - iput(inode); + ret = btrfs_drop_verity_items(inode); + iput(&inode->vfs_inode); inode = NULL; if (ret) goto out; @@ -3641,7 +3709,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) nr_unlink++; /* this will do delete_inode and everything for us */ - iput(inode); + iput(&inode->vfs_inode); } /* release the path since we're done with it */ btrfs_release_path(path); @@ -3658,19 +3726,22 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) out: if (ret) btrfs_err(fs_info, "could not do orphan cleanup %d", ret); - btrfs_free_path(path); return ret; } /* - * very simple check to peek ahead in the leaf looking for xattrs. If we - * don't find any xattrs, we know there can't be any acls. + * Look ahead in the leaf for xattrs. If we don't find any then we know there + * can't be any ACLs. * - * slot is the slot the inode is in, objectid is the objectid of the inode + * @leaf: the eb leaf where to search + * @slot: the slot the inode is in + * @objectid: the objectid of the inode + * + * Return true if there is xattr/ACL, false otherwise. */ -static noinline int acls_after_inode_item(struct extent_buffer *leaf, - int slot, u64 objectid, - int *first_xattr_slot) +static noinline bool acls_after_inode_item(struct extent_buffer *leaf, + int slot, u64 objectid, + int *first_xattr_slot) { u32 nritems = btrfs_header_nritems(leaf); struct btrfs_key found_key; @@ -3690,45 +3761,50 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, while (slot < nritems) { btrfs_item_key_to_cpu(leaf, &found_key, slot); - /* we found a different objectid, there must not be acls */ + /* We found a different objectid, there must be no ACLs. */ if (found_key.objectid != objectid) - return 0; + return false; - /* we found an xattr, assume we've got an acl */ + /* We found an xattr, assume we've got an ACL. */ if (found_key.type == BTRFS_XATTR_ITEM_KEY) { if (*first_xattr_slot == -1) *first_xattr_slot = slot; if (found_key.offset == xattr_access || found_key.offset == xattr_default) - return 1; + return true; } /* - * we found a key greater than an xattr key, there can't - * be any acls later on + * We found a key greater than an xattr key, there can't be any + * ACLs later on. */ if (found_key.type > BTRFS_XATTR_ITEM_KEY) - return 0; + return false; slot++; scanned++; /* - * it goes inode, inode backrefs, xattrs, extents, - * so if there are a ton of hard links to an inode there can - * be a lot of backrefs. Don't waste time searching too hard, - * this is just an optimization + * The item order goes like: + * - inode + * - inode backrefs + * - xattrs + * - extents, + * + * so if there are lots of hard links to an inode there can be + * a lot of backrefs. Don't waste time searching too hard, + * this is just an optimization. */ if (scanned >= 8) break; } - /* we hit the end of the leaf before we found an xattr or - * something larger than an xattr. We have to assume the inode - * has acls + /* + * We hit the end of the leaf before we found an xattr or something + * larger than an xattr. We have to assume the inode has ACLs. */ if (*first_xattr_slot == -1) *first_xattr_slot = slot; - return 1; + return true; } static int btrfs_init_file_extent_tree(struct btrfs_inode *inode) @@ -3748,7 +3824,8 @@ static int btrfs_init_file_extent_tree(struct btrfs_inode *inode) if (!inode->file_extent_tree) return -ENOMEM; - extent_io_tree_init(fs_info, inode->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); + btrfs_extent_io_tree_init(fs_info, inode->file_extent_tree, + IO_TREE_INODE_FILE_EXTENT); /* Lockdep class is set only for the file extent tree. */ lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class); @@ -3791,12 +3868,13 @@ static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) * * On failure clean up the inode. */ -static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) +static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path *path) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct inode *vfs_inode = &inode->vfs_inode; struct btrfs_key location; unsigned long ptr; int maybe_acls; @@ -3805,7 +3883,7 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) bool filled = false; int first_xattr_slot; - ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); + ret = btrfs_init_file_extent_tree(inode); if (ret) goto out; @@ -3815,7 +3893,7 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) ASSERT(path); - btrfs_get_inode_key(BTRFS_I(inode), &location); + btrfs_get_inode_key(inode, &location); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { @@ -3835,41 +3913,42 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); - inode->i_mode = btrfs_inode_mode(leaf, inode_item); - set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); - i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); - i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); - btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item)); - btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, - round_up(i_size_read(inode), fs_info->sectorsize)); - - inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime), + vfs_inode->i_mode = btrfs_inode_mode(leaf, inode_item); + set_nlink(vfs_inode, btrfs_inode_nlink(leaf, inode_item)); + i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item)); + i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item)); + btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); + btrfs_inode_set_file_extent_range(inode, 0, + round_up(i_size_read(vfs_inode), fs_info->sectorsize)); + + inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime), btrfs_timespec_nsec(leaf, &inode_item->atime)); - inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime), + inode_set_mtime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->mtime), btrfs_timespec_nsec(leaf, &inode_item->mtime)); - inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime), + inode_set_ctime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->ctime), btrfs_timespec_nsec(leaf, &inode_item->ctime)); - BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime); - BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime); + inode->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime); + inode->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime); - inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); - BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); - BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); + inode_set_bytes(vfs_inode, btrfs_inode_nbytes(leaf, inode_item)); + inode->generation = btrfs_inode_generation(leaf, inode_item); + inode->last_trans = btrfs_inode_transid(leaf, inode_item); - inode_set_iversion_queried(inode, - btrfs_inode_sequence(leaf, inode_item)); - inode->i_generation = BTRFS_I(inode)->generation; - inode->i_rdev = 0; + inode_set_iversion_queried(vfs_inode, btrfs_inode_sequence(leaf, inode_item)); + vfs_inode->i_generation = inode->generation; + vfs_inode->i_rdev = 0; rdev = btrfs_inode_rdev(leaf, inode_item); - if (S_ISDIR(inode->i_mode)) - BTRFS_I(inode)->index_cnt = (u64)-1; + if (S_ISDIR(vfs_inode->i_mode)) + inode->index_cnt = (u64)-1; btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), - &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); + &inode->flags, &inode->ro_flags); + btrfs_update_inode_mapping_flags(inode); + btrfs_set_inode_mapping_order(inode); cache_index: /* @@ -3881,9 +3960,8 @@ cache_index: * This is required for both inode re-read from disk and delayed inode * in the delayed_nodes xarray. */ - if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info)) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + if (inode->last_trans == btrfs_get_fs_generation(fs_info)) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); /* * We don't persist the id of the transaction where an unlink operation @@ -3912,7 +3990,7 @@ cache_index: * transaction commits on fsync if our inode is a directory, or if our * inode is not a directory, logging its parent unnecessarily. */ - BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; + inode->last_unlink_trans = inode->last_trans; /* * Same logic as for last_unlink_trans. We don't persist the generation @@ -3920,15 +3998,15 @@ cache_index: * operation, so after eviction and reloading the inode we must be * pessimistic and assume the last transaction that modified the inode. */ - BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans; + inode->last_reflink_trans = inode->last_trans; path->slots[0]++; - if (inode->i_nlink != 1 || + if (vfs_inode->i_nlink != 1 || path->slots[0] >= btrfs_header_nritems(leaf)) goto cache_acl; btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); - if (location.objectid != btrfs_ino(BTRFS_I(inode))) + if (location.objectid != btrfs_ino(inode)) goto cache_acl; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); @@ -3936,13 +4014,12 @@ cache_index: struct btrfs_inode_ref *ref; ref = (struct btrfs_inode_ref *)ptr; - BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); + inode->dir_index = btrfs_inode_ref_index(leaf, ref); } else if (location.type == BTRFS_INODE_EXTREF_KEY) { struct btrfs_inode_extref *extref; extref = (struct btrfs_inode_extref *)ptr; - BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, - extref); + inode->dir_index = btrfs_inode_extref_index(leaf, extref); } cache_acl: /* @@ -3950,50 +4027,49 @@ cache_acl: * any xattrs or acls */ maybe_acls = acls_after_inode_item(leaf, path->slots[0], - btrfs_ino(BTRFS_I(inode)), &first_xattr_slot); + btrfs_ino(inode), &first_xattr_slot); if (first_xattr_slot != -1) { path->slots[0] = first_xattr_slot; ret = btrfs_load_inode_props(inode, path); if (ret) btrfs_err(fs_info, "error loading props for ino %llu (root %llu): %d", - btrfs_ino(BTRFS_I(inode)), - btrfs_root_id(root), ret); + btrfs_ino(inode), btrfs_root_id(root), ret); } if (!maybe_acls) - cache_no_acl(inode); + cache_no_acl(vfs_inode); - switch (inode->i_mode & S_IFMT) { + switch (vfs_inode->i_mode & S_IFMT) { case S_IFREG: - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_fop = &btrfs_file_operations; - inode->i_op = &btrfs_file_inode_operations; + vfs_inode->i_mapping->a_ops = &btrfs_aops; + vfs_inode->i_fop = &btrfs_file_operations; + vfs_inode->i_op = &btrfs_file_inode_operations; break; case S_IFDIR: - inode->i_fop = &btrfs_dir_file_operations; - inode->i_op = &btrfs_dir_inode_operations; + vfs_inode->i_fop = &btrfs_dir_file_operations; + vfs_inode->i_op = &btrfs_dir_inode_operations; break; case S_IFLNK: - inode->i_op = &btrfs_symlink_inode_operations; - inode_nohighmem(inode); - inode->i_mapping->a_ops = &btrfs_aops; + vfs_inode->i_op = &btrfs_symlink_inode_operations; + inode_nohighmem(vfs_inode); + vfs_inode->i_mapping->a_ops = &btrfs_aops; break; default: - inode->i_op = &btrfs_special_inode_operations; - init_special_inode(inode, inode->i_mode, rdev); + vfs_inode->i_op = &btrfs_special_inode_operations; + init_special_inode(vfs_inode, vfs_inode->i_mode, rdev); break; } btrfs_sync_inode_flags_to_i_flags(inode); - ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); + ret = btrfs_add_inode_to_root(inode, true); if (ret) goto out; return 0; out: - iget_failed(inode); + iget_failed(vfs_inode); return ret; } @@ -4005,45 +4081,35 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *item, struct inode *inode) { - struct btrfs_map_token token; u64 flags; - btrfs_init_map_token(&token, leaf); - - btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); - btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); - btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size); - btrfs_set_token_inode_mode(&token, item, inode->i_mode); - btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); - - btrfs_set_token_timespec_sec(&token, &item->atime, - inode_get_atime_sec(inode)); - btrfs_set_token_timespec_nsec(&token, &item->atime, - inode_get_atime_nsec(inode)); - - btrfs_set_token_timespec_sec(&token, &item->mtime, - inode_get_mtime_sec(inode)); - btrfs_set_token_timespec_nsec(&token, &item->mtime, - inode_get_mtime_nsec(inode)); - - btrfs_set_token_timespec_sec(&token, &item->ctime, - inode_get_ctime_sec(inode)); - btrfs_set_token_timespec_nsec(&token, &item->ctime, - inode_get_ctime_nsec(inode)); - - btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec); - btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec); - - btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode)); - btrfs_set_token_inode_generation(&token, item, - BTRFS_I(inode)->generation); - btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); - btrfs_set_token_inode_transid(&token, item, trans->transid); - btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); + btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); + btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); + btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); + btrfs_set_inode_mode(leaf, item, inode->i_mode); + btrfs_set_inode_nlink(leaf, item, inode->i_nlink); + + btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(inode)); + btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(inode)); + + btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(inode)); + btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(inode)); + + btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(inode)); + btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(inode)); + + btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec); + btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec); + + btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); + btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); + btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(inode)); + btrfs_set_inode_transid(leaf, item, trans->transid); + btrfs_set_inode_rdev(leaf, item, inode->i_rdev); flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, BTRFS_I(inode)->ro_flags); - btrfs_set_token_inode_flags(&token, item, flags); - btrfs_set_token_inode_block_group(&token, item, 0); + btrfs_set_inode_flags(leaf, item, flags); + btrfs_set_inode_block_group(leaf, item, 0); } /* @@ -4053,7 +4119,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { struct btrfs_inode_item *inode_item; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key key; int ret; @@ -4067,7 +4133,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, if (ret) { if (ret > 0) ret = -ENOENT; - goto failed; + return ret; } leaf = path->nodes[0]; @@ -4075,12 +4141,8 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item); fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_set_inode_last_trans(trans, inode); - ret = 0; -failed: - btrfs_free_path(path); - return ret; + return 0; } /* @@ -4146,20 +4208,22 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, u64 dir_ino = btrfs_ino(dir); path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1); if (IS_ERR_OR_NULL(di)) { - ret = di ? PTR_ERR(di) : -ENOENT; - goto err; + btrfs_free_path(path); + return di ? PTR_ERR(di) : -ENOENT; } ret = btrfs_delete_one_dir_name(trans, root, path, di); + /* + * Down the call chains below we'll also need to allocate a path, so no + * need to hold on to this one for longer than necessary. + */ + btrfs_free_path(path); if (ret) - goto err; - btrfs_release_path(path); + return ret; /* * If we don't have dir index, we have to get it by looking up @@ -4181,11 +4245,11 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index); if (ret) { - btrfs_info(fs_info, - "failed to delete reference to %.*s, inode %llu parent %llu", - name->len, name->name, ino, dir_ino); + btrfs_crit(fs_info, + "failed to delete reference to %.*s, root %llu inode %llu parent %llu", + name->len, name->name, btrfs_root_id(root), ino, dir_ino); btrfs_abort_transaction(trans, ret); - goto err; + return ret; } skip_backref: if (rename_ctx) @@ -4194,7 +4258,7 @@ skip_backref: ret = btrfs_delete_delayed_dir_index(trans, dir, index); if (ret) { btrfs_abort_transaction(trans, ret); - goto err; + return ret; } /* @@ -4218,19 +4282,14 @@ skip_backref: * holding. */ btrfs_run_delayed_iput(fs_info, inode); -err: - btrfs_free_path(path); - if (ret) - goto out; btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2); inode_inc_iversion(&inode->vfs_inode); inode_set_ctime_current(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); - ret = btrfs_update_inode(trans, dir); -out: - return ret; + + return btrfs_update_inode(trans, dir); } int btrfs_unlink_inode(struct btrfs_trans_handle *trans, @@ -4414,7 +4473,7 @@ out: static noinline int may_destroy_subvol(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dir_item *di; struct btrfs_key key; struct fscrypt_str name = FSTR_INIT("default", 7); @@ -4436,7 +4495,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) btrfs_err(fs_info, "deleting default subvolume %llu is not allowed", key.objectid); - goto out; + return ret; } btrfs_release_path(path); } @@ -4447,14 +4506,13 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret == 0) { /* * Key with offset -1 found, there would have to exist a root * with such id, but this is out of valid range. */ - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } ret = 0; @@ -4464,8 +4522,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY) ret = -ENOTEMPTY; } -out: - btrfs_free_path(path); + return ret; } @@ -4637,68 +4694,68 @@ out_up_write: return ret; } -static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) +static int btrfs_rmdir(struct inode *vfs_dir, struct dentry *dentry) { - struct inode *inode = d_inode(dentry); - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_inode *dir = BTRFS_I(vfs_dir); + struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); + struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; struct btrfs_trans_handle *trans; - u64 last_unlink_trans; struct fscrypt_name fname; - if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) + if (inode->vfs_inode.i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; - if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) { + if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) { btrfs_err(fs_info, "extent tree v2 doesn't support snapshot deletion yet"); return -EOPNOTSUPP; } - return btrfs_delete_subvolume(BTRFS_I(dir), dentry); + return btrfs_delete_subvolume(dir, dentry); } - ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + ret = fscrypt_setup_filename(vfs_dir, &dentry->d_name, 1, &fname); if (ret) return ret; /* This needs to handle no-key deletions later on */ - trans = __unlink_start_trans(BTRFS_I(dir)); + trans = __unlink_start_trans(dir); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_notrans; } - if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); + /* + * Propagate the last_unlink_trans value of the deleted dir to its + * parent directory. This is to prevent an unrecoverable log tree in the + * case we do something like this: + * 1) create dir foo + * 2) create snapshot under dir foo + * 3) delete the snapshot + * 4) rmdir foo + * 5) mkdir foo + * 6) fsync foo or some file inside foo + * + * This is because we can't unlink other roots when replaying the dir + * deletes for directory foo. + */ + if (inode->last_unlink_trans >= trans->transid) + btrfs_record_snapshot_destroy(trans, dir); + + if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { + ret = btrfs_unlink_subvol(trans, dir, dentry); goto out; } - ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + ret = btrfs_orphan_add(trans, inode); if (ret) goto out; - last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; - /* now the directory is empty */ - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), - &fname.disk_name); - if (!ret) { - btrfs_i_size_write(BTRFS_I(inode), 0); - /* - * Propagate the last_unlink_trans value of the deleted dir to - * its parent directory. This is to prevent an unrecoverable - * log tree in the case we do something like this: - * 1) create dir foo - * 2) create snapshot under dir foo - * 3) delete the snapshot - * 4) rmdir foo - * 5) mkdir foo - * 6) fsync foo or some file inside foo - */ - if (last_unlink_trans >= trans->transid) - BTRFS_I(dir)->last_unlink_trans = last_unlink_trans; - } + ret = btrfs_unlink_inode(trans, dir, inode, &fname.disk_name); + if (!ret) + btrfs_i_size_write(inode, 0); out: btrfs_end_transaction(trans); out_notrans: @@ -4708,20 +4765,80 @@ out_notrans: return ret; } +static bool is_inside_block(u64 bytenr, u64 blockstart, u32 blocksize) +{ + ASSERT(IS_ALIGNED(blockstart, blocksize), "blockstart=%llu blocksize=%u", + blockstart, blocksize); + + if (blockstart <= bytenr && bytenr <= blockstart + blocksize - 1) + return true; + return false; +} + +static int truncate_block_zero_beyond_eof(struct btrfs_inode *inode, u64 start) +{ + const pgoff_t index = (start >> PAGE_SHIFT); + struct address_space *mapping = inode->vfs_inode.i_mapping; + struct folio *folio; + u64 zero_start; + u64 zero_end; + int ret = 0; + +again: + folio = filemap_lock_folio(mapping, index); + /* No folio present. */ + if (IS_ERR(folio)) + return 0; + + if (!folio_test_uptodate(folio)) { + ret = btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (folio->mapping != mapping) { + folio_unlock(folio); + folio_put(folio); + goto again; + } + if (!folio_test_uptodate(folio)) { + ret = -EIO; + goto out_unlock; + } + } + folio_wait_writeback(folio); + + /* + * We do not need to lock extents nor wait for OE, as it's already + * beyond EOF. + */ + + zero_start = max_t(u64, folio_pos(folio), start); + zero_end = folio_end(folio); + folio_zero_range(folio, zero_start - folio_pos(folio), + zero_end - zero_start); + +out_unlock: + folio_unlock(folio); + folio_put(folio); + return ret; +} + /* - * Read, zero a chunk and write a block. + * Handle the truncation of a fs block. * - * @inode - inode that we're zeroing - * @from - the offset to start zeroing - * @len - the length to zero, 0 to zero the entire range respective to the - * offset - * @front - zero up to the offset instead of from the offset on + * @inode - inode that we're zeroing + * @offset - the file offset of the block to truncate + * The value must be inside [@start, @end], and the function will do + * extra checks if the block that covers @offset needs to be zeroed. + * @start - the start file offset of the range we want to zero + * @end - the end (inclusive) file offset of the range we want to zero. * - * This will find the block for the "from" offset and cow the block and zero the - * part we want to zero. This is used with truncate and hole punching. + * If the range is not block aligned, read out the folio that covers @offset, + * and if needed zero blocks that are inside the folio and covered by [@start, @end). + * If @start or @end + 1 lands inside a block, that block will be marked dirty + * for writeback. + * + * This is utilized by hole punch, zero range, file expansion. */ -int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, - int front) +int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping; @@ -4731,27 +4848,66 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, struct extent_changeset *data_reserved = NULL; bool only_release_metadata = false; u32 blocksize = fs_info->sectorsize; - pgoff_t index = from >> PAGE_SHIFT; - unsigned offset = from & (blocksize - 1); + pgoff_t index = (offset >> PAGE_SHIFT); struct folio *folio; gfp_t mask = btrfs_alloc_write_mask(mapping); - size_t write_bytes = blocksize; int ret = 0; + const bool in_head_block = is_inside_block(offset, round_down(start, blocksize), + blocksize); + const bool in_tail_block = is_inside_block(offset, round_down(end, blocksize), + blocksize); + bool need_truncate_head = false; + bool need_truncate_tail = false; + u64 zero_start; + u64 zero_end; u64 block_start; u64 block_end; - if (IS_ALIGNED(offset, blocksize) && - (!len || IS_ALIGNED(len, blocksize))) + /* @offset should be inside the range. */ + ASSERT(start <= offset && offset <= end, "offset=%llu start=%llu end=%llu", + offset, start, end); + + /* The range is aligned at both ends. */ + if (IS_ALIGNED(start, blocksize) && IS_ALIGNED(end + 1, blocksize)) { + /* + * For block size < page size case, we may have polluted blocks + * beyond EOF. So we also need to zero them out. + */ + if (end == (u64)-1 && blocksize < PAGE_SIZE) + ret = truncate_block_zero_beyond_eof(inode, start); goto out; + } - block_start = round_down(from, blocksize); + /* + * @offset may not be inside the head nor tail block. In that case we + * don't need to do anything. + */ + if (!in_head_block && !in_tail_block) + goto out; + + /* + * Skip the truncatioin if the range in the target block is already aligned. + * The seemingly complex check will also handle the same block case. + */ + if (in_head_block && !IS_ALIGNED(start, blocksize)) + need_truncate_head = true; + if (in_tail_block && !IS_ALIGNED(end + 1, blocksize)) + need_truncate_tail = true; + if (!need_truncate_head && !need_truncate_tail) + goto out; + + block_start = round_down(offset, blocksize); block_end = block_start + blocksize - 1; ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, blocksize, false); if (ret < 0) { + size_t write_bytes = blocksize; + if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) { - /* For nocow case, no need to reserve data space */ + /* For nocow case, no need to reserve data space. */ + ASSERT(write_bytes == blocksize, "write_bytes=%zu blocksize=%u", + write_bytes, blocksize); only_release_metadata = true; } else { goto out; @@ -4768,10 +4924,13 @@ again: folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); if (IS_ERR(folio)) { - btrfs_delalloc_release_space(inode, data_reserved, block_start, - blocksize, true); + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, blocksize, true); + else + btrfs_delalloc_release_space(inode, data_reserved, + block_start, blocksize, true); btrfs_delalloc_release_extents(inode, blocksize); - ret = -ENOMEM; + ret = PTR_ERR(folio); goto out; } @@ -4801,11 +4960,11 @@ again: folio_wait_writeback(folio); - lock_extent(io_tree, block_start, block_end, &cached_state); + btrfs_lock_extent(io_tree, block_start, block_end, &cached_state); ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { - unlock_extent(io_tree, block_start, block_end, &cached_state); + btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state); folio_unlock(folio); folio_put(folio); btrfs_start_ordered_extent(ordered); @@ -4813,37 +4972,46 @@ again: goto again; } - clear_extent_bit(&inode->io_tree, block_start, block_end, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - &cached_state); + btrfs_clear_extent_bit(&inode->io_tree, block_start, block_end, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + &cached_state); ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, &cached_state); if (ret) { - unlock_extent(io_tree, block_start, block_end, &cached_state); + btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state); goto out_unlock; } - if (offset != blocksize) { - if (!len) - len = blocksize - offset; - if (front) - folio_zero_range(folio, block_start - folio_pos(folio), - offset); - else - folio_zero_range(folio, - (block_start - folio_pos(folio)) + offset, - len); + if (end == (u64)-1) { + /* + * We're truncating beyond EOF, the remaining blocks normally are + * already holes thus no need to zero again, but it's possible for + * fs block size < page size cases to have memory mapped writes + * to pollute ranges beyond EOF. + * + * In that case although such polluted blocks beyond EOF will + * not reach disk, it still affects our page caches. + */ + zero_start = max_t(u64, folio_pos(folio), start); + zero_end = min_t(u64, folio_end(folio) - 1, end); + } else { + zero_start = max_t(u64, block_start, start); + zero_end = min_t(u64, block_end, end); } + folio_zero_range(folio, zero_start - folio_pos(folio), + zero_end - zero_start + 1); + btrfs_folio_clear_checked(fs_info, folio, block_start, block_end + 1 - block_start); btrfs_folio_set_dirty(fs_info, folio, block_start, block_end + 1 - block_start); - unlock_extent(io_tree, block_start, block_end, &cached_state); if (only_release_metadata) - set_extent_bit(&inode->io_tree, block_start, block_end, - EXTENT_NORESERVE, NULL); + btrfs_set_extent_bit(&inode->io_tree, block_start, block_end, + EXTENT_NORESERVE, &cached_state); + + btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state); out_unlock: if (ret) { @@ -4936,7 +5104,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) * rest of the block before we expand the i_size, otherwise we could * expose stale data. */ - ret = btrfs_truncate_block(inode, oldsize, 0, 0); + ret = btrfs_truncate_block(inode, oldsize, oldsize, -1); if (ret) return ret; @@ -4953,7 +5121,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) em = NULL; break; } - last_byte = min(extent_map_end(em), block_end); + last_byte = min(btrfs_extent_map_end(em), block_end); last_byte = ALIGN(last_byte, fs_info->sectorsize); hole_size = last_byte - cur_offset; @@ -4969,7 +5137,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) if (ret) break; - hole_em = alloc_extent_map(); + hole_em = btrfs_alloc_extent_map(); if (!hole_em) { btrfs_drop_extent_map_range(inode, cur_offset, cur_offset + hole_size - 1, @@ -4986,7 +5154,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) hole_em->generation = btrfs_get_fs_generation(fs_info); ret = btrfs_replace_extent_map_range(inode, hole_em, true); - free_extent_map(hole_em); + btrfs_free_extent_map(hole_em); } else { ret = btrfs_inode_set_file_extent_range(inode, cur_offset, hole_size); @@ -4994,14 +5162,14 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) break; } next: - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; cur_offset = last_byte; if (cur_offset >= block_end) break; } - free_extent_map(em); - unlock_extent(io_tree, hole_start, block_end - 1, &cached_state); + btrfs_free_extent_map(em); + btrfs_unlock_extent(io_tree, hole_start, block_end - 1, &cached_state); return ret; } @@ -5081,7 +5249,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize); if (ret && inode->i_nlink) { - int err; + int ret2; /* * Truncate failed, so fix up the in-memory size. We @@ -5089,9 +5257,9 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * wait for disk_i_size to be stable and then update the * in-memory size to match. */ - err = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); - if (err) - return err; + ret2 = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); + if (ret2) + return ret2; i_size_write(inode, BTRFS_I(inode)->disk_i_size); } } @@ -5104,31 +5272,31 @@ static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, { struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; - int err; + int ret; if (btrfs_root_readonly(root)) return -EROFS; - err = setattr_prepare(idmap, dentry, attr); - if (err) - return err; + ret = setattr_prepare(idmap, dentry, attr); + if (ret) + return ret; if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { - err = btrfs_setsize(inode, attr); - if (err) - return err; + ret = btrfs_setsize(inode, attr); + if (ret) + return ret; } if (attr->ia_valid) { setattr_copy(idmap, inode, attr); inode_inc_iversion(inode); - err = btrfs_dirty_inode(BTRFS_I(inode)); + ret = btrfs_dirty_inode(BTRFS_I(inode)); - if (!err && attr->ia_valid & ATTR_MODE) - err = posix_acl_chmod(idmap, dentry, inode->i_mode); + if (!ret && attr->ia_valid & ATTR_MODE) + ret = posix_acl_chmod(idmap, dentry, inode->i_mode); } - return err; + return ret; } /* @@ -5185,7 +5353,7 @@ static void evict_inode_truncate_pages(struct inode *inode) state_flags = state->state; spin_unlock(&io_tree->lock); - lock_extent(io_tree, start, end, &cached_state); + btrfs_lock_extent(io_tree, start, end, &cached_state); /* * If still has DELALLOC flag, the extent didn't reach disk, @@ -5199,9 +5367,9 @@ static void evict_inode_truncate_pages(struct inode *inode) btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start, end - start + 1, NULL); - clear_extent_bit(io_tree, start, end, - EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, - &cached_state); + btrfs_clear_extent_bit(io_tree, start, end, + EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, + &cached_state); cond_resched(); spin_lock(&io_tree->lock); @@ -5262,7 +5430,7 @@ void btrfs_evict_inode(struct inode *inode) struct btrfs_fs_info *fs_info; struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_block_rsv *rsv = NULL; + struct btrfs_block_rsv rsv; int ret; trace_btrfs_inode_evict(inode); @@ -5310,11 +5478,9 @@ void btrfs_evict_inode(struct inode *inode) */ btrfs_kill_delayed_inode_items(BTRFS_I(inode)); - rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); - if (!rsv) - goto out; - rsv->size = btrfs_calc_metadata_size(fs_info, 1); - rsv->failfast = true; + btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP); + rsv.size = btrfs_calc_metadata_size(fs_info, 1); + rsv.failfast = true; btrfs_i_size_write(BTRFS_I(inode), 0); @@ -5326,11 +5492,11 @@ void btrfs_evict_inode(struct inode *inode) .min_type = 0, }; - trans = evict_refill_and_join(root, rsv); + trans = evict_refill_and_join(root, &rsv); if (IS_ERR(trans)) - goto out; + goto out_release; - trans->block_rsv = rsv; + trans->block_rsv = &rsv; ret = btrfs_truncate_inode_items(trans, root, &control); trans->block_rsv = &fs_info->trans_block_rsv; @@ -5342,7 +5508,7 @@ void btrfs_evict_inode(struct inode *inode) */ btrfs_btree_balance_dirty_nodelay(fs_info); if (ret && ret != -ENOSPC && ret != -EAGAIN) - goto out; + goto out_release; else if (!ret) break; } @@ -5356,16 +5522,17 @@ void btrfs_evict_inode(struct inode *inode) * If it turns out that we are dropping too many of these, we might want * to add a mechanism for retrying these after a commit. */ - trans = evict_refill_and_join(root, rsv); + trans = evict_refill_and_join(root, &rsv); if (!IS_ERR(trans)) { - trans->block_rsv = rsv; + trans->block_rsv = &rsv; btrfs_orphan_del(trans, BTRFS_I(inode)); trans->block_rsv = &fs_info->trans_block_rsv; btrfs_end_transaction(trans); } +out_release: + btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL); out: - btrfs_free_block_rsv(fs_info, rsv); /* * If we didn't successfully delete, the orphan item will still be in * the tree and we'll retry on the next mount. Again, we might also want @@ -5387,7 +5554,7 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, struct btrfs_key *location, u8 *type) { struct btrfs_dir_item *di; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = dir->root; int ret = 0; struct fscrypt_name fname; @@ -5398,7 +5565,7 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); if (ret < 0) - goto out; + return ret; /* * fscrypt_setup_filename() should never return a positive value, but * gcc on sparc/parisc thinks it can, so assert that doesn't happen. @@ -5427,7 +5594,6 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, *type = btrfs_dir_ftype(path->nodes[0], di); out: fscrypt_free_filename(&fname); - btrfs_free_path(path); return ret; } @@ -5442,7 +5608,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, struct btrfs_key *location, struct btrfs_root **sub_root) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *new_root; struct btrfs_root_ref *ref; struct extent_buffer *leaf; @@ -5498,7 +5664,6 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, location->offset = 0; err = 0; out: - btrfs_free_path(path); fscrypt_free_filename(&fname); return err; } @@ -5549,7 +5714,7 @@ static int btrfs_find_actor(struct inode *inode, void *opaque) args->root == BTRFS_I(inode)->root; } -static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) +static struct btrfs_inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; @@ -5561,40 +5726,42 @@ static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor, btrfs_init_locked_inode, (void *)&args); - return inode; + if (!inode) + return NULL; + return BTRFS_I(inode); } /* * Get an inode object given its inode number and corresponding root. Path is * preallocated to prevent recursing back to iget through allocator. */ -struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, - struct btrfs_path *path) +struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, + struct btrfs_path *path) { - struct inode *inode; + struct btrfs_inode *inode; int ret; inode = btrfs_iget_locked(ino, root); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode->vfs_inode.i_state & I_NEW)) return inode; ret = btrfs_read_locked_inode(inode, path); if (ret) return ERR_PTR(ret); - unlock_new_inode(inode); + unlock_new_inode(&inode->vfs_inode); return inode; } /* * Get an inode object given its inode number and corresponding root. */ -struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) +struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root) { - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_path *path; int ret; @@ -5602,55 +5769,60 @@ struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode->vfs_inode.i_state & I_NEW)) return inode; path = btrfs_alloc_path(); - if (!path) + if (!path) { + iget_failed(&inode->vfs_inode); return ERR_PTR(-ENOMEM); + } ret = btrfs_read_locked_inode(inode, path); btrfs_free_path(path); if (ret) return ERR_PTR(ret); - unlock_new_inode(inode); + unlock_new_inode(&inode->vfs_inode); return inode; } -static struct inode *new_simple_dir(struct inode *dir, - struct btrfs_key *key, - struct btrfs_root *root) +static struct btrfs_inode *new_simple_dir(struct inode *dir, + struct btrfs_key *key, + struct btrfs_root *root) { struct timespec64 ts; - struct inode *inode = new_inode(dir->i_sb); + struct inode *vfs_inode; + struct btrfs_inode *inode; - if (!inode) + vfs_inode = new_inode(dir->i_sb); + if (!vfs_inode) return ERR_PTR(-ENOMEM); - BTRFS_I(inode)->root = btrfs_grab_root(root); - BTRFS_I(inode)->ref_root_id = key->objectid; - set_bit(BTRFS_INODE_ROOT_STUB, &BTRFS_I(inode)->runtime_flags); - set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); + inode = BTRFS_I(vfs_inode); + inode->root = btrfs_grab_root(root); + inode->ref_root_id = key->objectid; + set_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags); + set_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags); - btrfs_set_inode_number(BTRFS_I(inode), BTRFS_EMPTY_SUBVOL_DIR_OBJECTID); + btrfs_set_inode_number(inode, BTRFS_EMPTY_SUBVOL_DIR_OBJECTID); /* * We only need lookup, the rest is read-only and there's no inode * associated with the dentry */ - inode->i_op = &simple_dir_inode_operations; - inode->i_opflags &= ~IOP_XATTR; - inode->i_fop = &simple_dir_operations; - inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; + vfs_inode->i_op = &simple_dir_inode_operations; + vfs_inode->i_opflags &= ~IOP_XATTR; + vfs_inode->i_fop = &simple_dir_operations; + vfs_inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; - ts = inode_set_ctime_current(inode); - inode_set_mtime_to_ts(inode, ts); - inode_set_atime_to_ts(inode, inode_get_atime(dir)); - BTRFS_I(inode)->i_otime_sec = ts.tv_sec; - BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec; + ts = inode_set_ctime_current(vfs_inode); + inode_set_mtime_to_ts(vfs_inode, ts); + inode_set_atime_to_ts(vfs_inode, inode_get_atime(dir)); + inode->i_otime_sec = ts.tv_sec; + inode->i_otime_nsec = ts.tv_nsec; - inode->i_uid = dir->i_uid; - inode->i_gid = dir->i_gid; + vfs_inode->i_uid = dir->i_uid; + vfs_inode->i_gid = dir->i_gid; return inode; } @@ -5664,15 +5836,15 @@ static_assert(BTRFS_FT_FIFO == FT_FIFO); static_assert(BTRFS_FT_SOCK == FT_SOCK); static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK); -static inline u8 btrfs_inode_type(struct inode *inode) +static inline u8 btrfs_inode_type(const struct btrfs_inode *inode) { - return fs_umode_to_ftype(inode->i_mode); + return fs_umode_to_ftype(inode->vfs_inode.i_mode); } struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) { struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *sub_root = root; struct btrfs_key location = { 0 }; @@ -5689,18 +5861,18 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (location.type == BTRFS_INODE_ITEM_KEY) { inode = btrfs_iget(location.objectid, root); if (IS_ERR(inode)) - return inode; + return ERR_CAST(inode); /* Do extra check against inode mode with di_type */ if (btrfs_inode_type(inode) != di_type) { btrfs_crit(fs_info, "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u", - inode->i_mode, btrfs_inode_type(inode), + inode->vfs_inode.i_mode, btrfs_inode_type(inode), di_type); - iput(inode); + iput(&inode->vfs_inode); return ERR_PTR(-EUCLEAN); } - return inode; + return &inode->vfs_inode; } ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry, @@ -5715,19 +5887,22 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) btrfs_put_root(sub_root); if (IS_ERR(inode)) - return inode; + return ERR_CAST(inode); down_read(&fs_info->cleanup_work_sem); - if (!sb_rdonly(inode->i_sb)) + if (!sb_rdonly(inode->vfs_inode.i_sb)) ret = btrfs_orphan_cleanup(sub_root); up_read(&fs_info->cleanup_work_sem); if (ret) { - iput(inode); + iput(&inode->vfs_inode); inode = ERR_PTR(ret); } } - return inode; + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return &inode->vfs_inode; } static int btrfs_dentry_delete(const struct dentry *dentry) @@ -5767,7 +5942,7 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; struct btrfs_key key, found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; int ret; @@ -5781,15 +5956,14 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; /* FIXME: we should be able to handle this */ if (ret == 0) - goto out; - ret = 0; + return ret; if (path->slots[0] == 0) { inode->index_cnt = BTRFS_DIR_START_INDEX; - goto out; + return 0; } path->slots[0]--; @@ -5800,13 +5974,12 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) if (found_key.objectid != btrfs_ino(inode) || found_key.type != BTRFS_DIR_INDEX_KEY) { inode->index_cnt = BTRFS_DIR_START_INDEX; - goto out; + return 0; } inode->index_cnt = found_key.offset + 1; -out: - btrfs_free_path(path); - return ret; + + return 0; } static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index) @@ -5909,7 +6082,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) struct btrfs_dir_item *di; struct btrfs_key key; struct btrfs_key found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); void *addr; LIST_HEAD(ins_list); LIST_HEAD(del_list); @@ -5992,8 +6165,7 @@ again: if (ret) goto nopos; - ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); - if (ret) + if (btrfs_readdir_delayed_dir_index(ctx, &ins_list)) goto nopos; /* @@ -6022,7 +6194,6 @@ nopos: err: if (put) btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list); - btrfs_free_path(path); return ret; } @@ -6200,7 +6371,7 @@ static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode * inode->flags |= BTRFS_INODE_NODATASUM; } - btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + btrfs_sync_inode_flags_to_i_flags(inode); } int btrfs_create_new_inode(struct btrfs_trans_handle *trans, @@ -6286,6 +6457,8 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (btrfs_test_opt(fs_info, NODATACOW)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM; + btrfs_update_inode_mapping_flags(BTRFS_I(inode)); + btrfs_set_inode_mapping_order(BTRFS_I(inode)); } ret = btrfs_insert_inode_locked(inode); @@ -6370,7 +6543,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, } } - btrfs_mark_buffer_dirty(trans, path->nodes[0]); /* * We don't need the path anymore, plus inheriting properties, adding * ACLs, security xattrs, orphan item or adding the link, will result in @@ -6380,7 +6552,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, path = NULL; if (args->subvol) { - struct inode *parent; + struct btrfs_inode *parent; /* * Subvolumes inherit properties from their parent subvolume, @@ -6390,11 +6562,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (IS_ERR(parent)) { ret = PTR_ERR(parent); } else { - ret = btrfs_inode_inherit_props(trans, inode, parent); - iput(parent); + ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode), + parent); + iput(&parent->vfs_inode); } } else { - ret = btrfs_inode_inherit_props(trans, inode, dir); + ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode), + BTRFS_I(dir)); } if (ret) { btrfs_err(fs_info, @@ -6428,13 +6602,17 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (args->orphan) { ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto discard; + } } else { ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, 0, BTRFS_I(inode)->dir_index); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto discard; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto discard; + } } return 0; @@ -6492,7 +6670,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, return ret; ret = btrfs_insert_dir_item(trans, name, parent_inode, &key, - btrfs_inode_type(&inode->vfs_inode), index); + btrfs_inode_type(inode), index); if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; else if (ret) { @@ -6521,20 +6699,18 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, fail_dir_item: if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { u64 local_index; - int err; - err = btrfs_del_root_ref(trans, key.objectid, - btrfs_root_id(root), parent_ino, - &local_index, name); - if (err) - btrfs_abort_transaction(trans, err); + int ret2; + + ret2 = btrfs_del_root_ref(trans, key.objectid, btrfs_root_id(root), + parent_ino, &local_index, name); + if (ret2) + btrfs_abort_transaction(trans, ret2); } else if (add_backref) { - u64 local_index; - int err; + int ret2; - err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino, - &local_index); - if (err) - btrfs_abort_transaction(trans, err); + ret2 = btrfs_del_inode_ref(trans, root, name, ino, parent_ino, NULL); + if (ret2) + btrfs_abort_transaction(trans, ret2); } /* Return the original error code */ @@ -6553,20 +6729,20 @@ static int btrfs_create_common(struct inode *dir, struct dentry *dentry, }; unsigned int trans_num_items; struct btrfs_trans_handle *trans; - int err; + int ret; - err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); - if (err) + ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); + if (ret) goto out_inode; trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_new_inode_args; } - err = btrfs_create_new_inode(trans, &new_inode_args); - if (!err) + ret = btrfs_create_new_inode(trans, &new_inode_args); + if (!ret) d_instantiate_new(dentry, inode); btrfs_end_transaction(trans); @@ -6574,9 +6750,9 @@ static int btrfs_create_common(struct inode *dir, struct dentry *dentry, out_new_inode_args: btrfs_new_inode_args_destroy(&new_inode_args); out_inode: - if (err) + if (ret) iput(inode); - return err; + return ret; } static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir, @@ -6617,7 +6793,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct fscrypt_name fname; u64 index; - int err; + int ret; int drop_inode = 0; /* do not allow sys_link's with other subvols of the same device */ @@ -6627,12 +6803,12 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (inode->i_nlink >= BTRFS_LINK_MAX) return -EMLINK; - err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); - if (err) + ret = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); + if (ret) goto fail; - err = btrfs_set_inode_index(BTRFS_I(dir), &index); - if (err) + ret = btrfs_set_inode_index(BTRFS_I(dir), &index); + if (ret) goto fail; /* @@ -6643,7 +6819,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, */ trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); trans = NULL; goto fail; } @@ -6656,24 +6832,24 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, ihold(inode); set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); - err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), + ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), &fname.disk_name, 1, index); - if (err) { + if (ret) { drop_inode = 1; } else { struct dentry *parent = dentry->d_parent; - err = btrfs_update_inode(trans, BTRFS_I(inode)); - if (err) + ret = btrfs_update_inode(trans, BTRFS_I(inode)); + if (ret) goto fail; if (inode->i_nlink == 1) { /* * If new hard link count is 1, it's a file created * with open(2) O_TMPFILE flag. */ - err = btrfs_orphan_del(trans, BTRFS_I(inode)); - if (err) + ret = btrfs_orphan_del(trans, BTRFS_I(inode)); + if (ret) goto fail; } d_instantiate(dentry, inode); @@ -6689,21 +6865,21 @@ fail: iput(inode); } btrfs_btree_balance_dirty(fs_info); - return err; + return ret; } -static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; inode = new_inode(dir->i_sb); if (!inode) - return -ENOMEM; + return ERR_PTR(-ENOMEM); inode_init_owner(idmap, inode, dir, S_IFDIR | mode); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; - return btrfs_create_common(dir, dentry, inode); + return ERR_PTR(btrfs_create_common(dir, dentry, inode)); } static noinline int uncompress_inline(struct btrfs_path *path, @@ -6712,6 +6888,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, { int ret; struct extent_buffer *leaf = path->nodes[0]; + const u32 blocksize = leaf->fs_info->sectorsize; char *tmp; size_t max_size; unsigned long inline_size; @@ -6728,7 +6905,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); - max_size = min_t(unsigned long, PAGE_SIZE, max_size); + max_size = min_t(unsigned long, blocksize, max_size); ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size, max_size); @@ -6740,14 +6917,15 @@ static noinline int uncompress_inline(struct btrfs_path *path, * cover that region here. */ - if (max_size < PAGE_SIZE) - folio_zero_range(folio, max_size, PAGE_SIZE - max_size); + if (max_size < blocksize) + folio_zero_range(folio, max_size, blocksize - max_size); kfree(tmp); return ret; } static int read_inline_extent(struct btrfs_path *path, struct folio *folio) { + const u32 blocksize = path->nodes[0]->fs_info->sectorsize; struct btrfs_file_extent_item *fi; void *kaddr; size_t copy_size; @@ -6762,14 +6940,14 @@ static int read_inline_extent(struct btrfs_path *path, struct folio *folio) if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) return uncompress_inline(path, folio, fi); - copy_size = min_t(u64, PAGE_SIZE, + copy_size = min_t(u64, blocksize, btrfs_file_extent_ram_bytes(path->nodes[0], fi)); kaddr = kmap_local_folio(folio, 0); read_extent_buffer(path->nodes[0], kaddr, btrfs_file_extent_inline_start(fi), copy_size); kunmap_local(kaddr); - if (copy_size < PAGE_SIZE) - folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size); + if (copy_size < blocksize) + folio_zero_range(folio, copy_size, blocksize - copy_size); return 0; } @@ -6808,18 +6986,18 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct extent_map_tree *em_tree = &inode->extent_tree; read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); + em = btrfs_lookup_extent_mapping(em_tree, start, len); read_unlock(&em_tree->lock); if (em) { if (em->start > start || em->start + em->len <= start) - free_extent_map(em); + btrfs_free_extent_map(em); else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio) - free_extent_map(em); + btrfs_free_extent_map(em); else goto out; } - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { ret = -ENOMEM; goto out; @@ -6956,7 +7134,7 @@ not_found: insert: ret = 0; btrfs_release_path(path); - if (em->start > start || extent_map_end(em) <= start) { + if (em->start > start || btrfs_extent_map_end(em) <= start) { btrfs_err(fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", em->start, em->len, start, len); @@ -6973,7 +7151,7 @@ out: trace_btrfs_get_extent(root, inode, em); if (ret) { - free_extent_map(em); + btrfs_free_extent_map(em); return ERR_PTR(ret); } return em; @@ -7001,8 +7179,6 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) * @orig_start: (optional) Return the original file offset of the file extent * @orig_len: (optional) Return the original on-disk length of the file extent * @ram_bytes: (optional) Return the ram_bytes of the file extent - * @strict: if true, omit optimizations that might force us into unnecessary - * cow. e.g., don't trust generation number. * * Return: * >0 and update @len if we can do nocow write @@ -7012,17 +7188,17 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) * NOTE: This only checks the file extents, caller is responsible to wait for * any ordered extents. */ -noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, +noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, - bool nowait, bool strict) + bool nowait) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct can_nocow_file_extent_args nocow_args = { 0 }; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; struct extent_buffer *leaf; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_file_extent_item *fi; struct btrfs_key key; int found_type; @@ -7032,81 +7208,74 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, return -ENOMEM; path->nowait = nowait; - ret = btrfs_lookup_file_extent(NULL, root, path, - btrfs_ino(BTRFS_I(inode)), offset, 0); + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), + offset, 0); if (ret < 0) - goto out; + return ret; if (ret == 1) { if (path->slots[0] == 0) { - /* can't find the item, must cow */ - ret = 0; - goto out; + /* Can't find the item, must COW. */ + return 0; } path->slots[0]--; } ret = 0; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != btrfs_ino(BTRFS_I(inode)) || + if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) { - /* not our file or wrong item type, must cow */ - goto out; + /* Not our file or wrong item type, must COW. */ + return 0; } if (key.offset > offset) { - /* Wrong offset, must cow */ - goto out; + /* Wrong offset, must COW. */ + return 0; } if (btrfs_file_extent_end(path) <= offset) - goto out; + return 0; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); found_type = btrfs_file_extent_type(leaf, fi); nocow_args.start = offset; nocow_args.end = offset + *len - 1; - nocow_args.strict = strict; nocow_args.free_path = true; - ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args); + ret = can_nocow_file_extent(path, &key, inode, &nocow_args); /* can_nocow_file_extent() has freed the path. */ path = NULL; if (ret != 1) { /* Treat errors as not being able to NOCOW. */ - ret = 0; - goto out; + return 0; } - ret = 0; if (btrfs_extent_readonly(fs_info, nocow_args.file_extent.disk_bytenr + nocow_args.file_extent.offset)) - goto out; + return 0; - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + if (!(inode->flags & BTRFS_INODE_NODATACOW) && found_type == BTRFS_FILE_EXTENT_PREALLOC) { u64 range_end; range_end = round_up(offset + nocow_args.file_extent.num_bytes, root->fs_info->sectorsize) - 1; - ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC); - if (ret) { - ret = -EAGAIN; - goto out; - } + ret = btrfs_test_range_bit_exists(io_tree, offset, range_end, + EXTENT_DELALLOC); + if (ret) + return -EAGAIN; } if (file_extent) memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent)); *len = nocow_args.file_extent.num_bytes; - ret = 1; -out: - btrfs_free_path(path); - return ret; + + return 1; } /* The callers of this must take lock_extent() */ @@ -7154,7 +7323,7 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, break; } - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) return ERR_PTR(-ENOMEM); @@ -7167,15 +7336,15 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, em->offset = file_extent->offset; em->flags |= EXTENT_FLAG_PINNED; if (type == BTRFS_ORDERED_COMPRESSED) - extent_map_set_compression(em, file_extent->compression); + btrfs_extent_map_set_compression(em, file_extent->compression); ret = btrfs_replace_extent_map_range(inode, em, true); if (ret) { - free_extent_map(em); + btrfs_free_extent_map(em); return ERR_PTR(ret); } - /* em got 2 refs now, callers needs to do free_extent_map once. */ + /* em got 2 refs now, callers needs to do btrfs_free_extent_map once. */ return em; } @@ -7189,13 +7358,13 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, static void wait_subpage_spinlock(struct folio *folio) { struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_is_subpage(fs_info, folio)) return; ASSERT(folio_test_private(folio) && folio_get_private(folio)); - subpage = folio_get_private(folio); + bfs = folio_get_private(folio); /* * This may look insane as we just acquire the spinlock and release it, @@ -7208,14 +7377,14 @@ static void wait_subpage_spinlock(struct folio *folio) * Here we just acquire the spinlock so that all existing callers * should exit and we're safe to release/invalidate the page. */ - spin_lock_irq(&subpage->lock); - spin_unlock_irq(&subpage->lock); + spin_lock_irq(&bfs->lock); + spin_unlock_irq(&bfs->lock); } static int btrfs_launder_folio(struct folio *folio) { return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio), - PAGE_SIZE, NULL); + folio_size(folio), NULL); } static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) @@ -7302,7 +7471,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, } if (!inode_evicting) - lock_extent(tree, page_start, page_end, &cached_state); + btrfs_lock_extent(tree, page_start, page_end, &cached_state); cur = page_start; while (cur < page_end) { @@ -7358,10 +7527,10 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * btrfs_finish_ordered_io(). */ if (!inode_evicting) - clear_extent_bit(tree, cur, range_end, - EXTENT_DELALLOC | - EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, &cached_state); + btrfs_clear_extent_bit(tree, cur, range_end, + EXTENT_DELALLOC | + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, &cached_state); spin_lock_irq(&inode->ordered_tree_lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); @@ -7403,12 +7572,11 @@ next: * Since the IO will never happen for this page. */ btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL); - if (!inode_evicting) { - clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | - EXTENT_DELALLOC | EXTENT_UPTODATE | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG | - extra_flags, &cached_state); - } + if (!inode_evicting) + btrfs_clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG | extra_flags, + &cached_state); cur = range_end + 1; } /* @@ -7433,7 +7601,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *rsv; + struct btrfs_block_rsv rsv; int ret; struct btrfs_trans_handle *trans; u64 mask = fs_info->sectorsize - 1; @@ -7475,11 +7643,9 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) * 2) fs_info->trans_block_rsv - this will have 1 items worth left for * updating the inode. */ - rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); - if (!rsv) - return -ENOMEM; - rsv->size = min_size; - rsv->failfast = true; + btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP); + rsv.size = min_size; + rsv.failfast = true; /* * 1 for the truncate slack space @@ -7492,7 +7658,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) } /* Migrate the slack space for the truncate to our reserve */ - ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, + ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv, min_size, false); /* * We have reserved 2 metadata units when we started the transaction and @@ -7504,7 +7670,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) goto out; } - trans->block_rsv = rsv; + trans->block_rsv = &rsv; while (1) { struct extent_state *cached_state = NULL; @@ -7512,7 +7678,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); control.new_size = new_size; - lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); + btrfs_lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); /* * We want to drop from the next block forward in case this new * size is not block aligned since we will be keeping the last @@ -7527,7 +7693,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); btrfs_inode_safe_disk_i_size_write(inode, control.last_size); - unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); + btrfs_unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) @@ -7547,9 +7713,9 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) break; } - btrfs_block_rsv_release(fs_info, rsv, -1, NULL); + btrfs_block_rsv_release(fs_info, &rsv, -1, NULL); ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, - rsv, min_size, false); + &rsv, min_size, false); /* * We have reserved 2 metadata units when we started the * transaction and min_size matches 1 unit, so this should never @@ -7558,7 +7724,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) if (WARN_ON(ret)) break; - trans->block_rsv = rsv; + trans->block_rsv = &rsv; } /* @@ -7571,7 +7737,8 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); - ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0); + ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, + inode->vfs_inode.i_size, (u64)-1); if (ret) goto out; trans = btrfs_start_transaction(root, 1); @@ -7596,7 +7763,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) btrfs_btree_balance_dirty(fs_info); } out: - btrfs_free_block_rsv(fs_info, rsv); + btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL); /* * So if we truncate and then write and fsync we normally would just * write the extents that changed, which is a problem if we need to @@ -7683,10 +7850,10 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->i_otime_nsec = 0; inode = &ei->vfs_inode; - extent_map_tree_init(&ei->extent_tree); + btrfs_extent_map_tree_init(&ei->extent_tree); /* This io tree sets the valid inode. */ - extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); + btrfs_extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); ei->io_tree.inode = ei; ei->file_extent_tree = NULL; @@ -7851,7 +8018,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap, generic_fillattr(idmap, request_mask, inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; - stat->subvol = BTRFS_I(inode)->root->root_key.objectid; + stat->subvol = btrfs_root_id(BTRFS_I(inode)->root); stat->result_mask |= STATX_SUBVOL; spin_lock(&BTRFS_I(inode)->lock); @@ -7884,6 +8051,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, int ret; int ret2; bool need_abort = false; + bool logs_pinned = false; struct fscrypt_name old_fname, new_fname; struct fscrypt_str *old_name, *new_name; @@ -8007,6 +8175,31 @@ static int btrfs_rename_exchange(struct inode *old_dir, inode_inc_iversion(new_inode); simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); + if (old_ino != BTRFS_FIRST_FREE_OBJECTID && + new_ino != BTRFS_FIRST_FREE_OBJECTID) { + /* + * If we are renaming in the same directory (and it's not for + * root entries) pin the log early to prevent any concurrent + * task from logging the directory after we removed the old + * entries and before we add the new entries, otherwise that + * task can sync a log without any entry for the inodes we are + * renaming and therefore replaying that log, if a power failure + * happens after syncing the log, would result in deleting the + * inodes. + * + * If the rename affects two different directories, we want to + * make sure the that there's no log commit that contains + * updates for only one of the directories but not for the + * other. + * + * If we are renaming an entry for a root, we don't care about + * log updates since we called btrfs_set_log_full_commit(). + */ + btrfs_pin_log_trans(root); + btrfs_pin_log_trans(dest); + logs_pinned = true; + } + if (old_dentry->d_parent != new_dentry->d_parent) { btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), BTRFS_I(old_inode), true); @@ -8017,31 +8210,45 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), old_name, &old_rename_ctx); - if (!ret) - ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } /* dest is a subvolume */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), new_name, &new_rename_ctx); - if (!ret) - ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), @@ -8064,30 +8271,23 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_inode)->dir_index = new_idx; /* - * Now pin the logs of the roots. We do it to ensure that no other task - * can sync the logs while we are in progress with the rename, because - * that could result in an inconsistency in case any of the inodes that - * are part of this rename operation were logged before. + * Do the log updates for all inodes. + * + * If either entry is for a root we don't need to update the logs since + * we've called btrfs_set_log_full_commit() before. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) - btrfs_pin_log_trans(root); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) - btrfs_pin_log_trans(dest); - - /* Do the log updates for all inodes. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + if (logs_pinned) { btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), old_rename_ctx.index, new_dentry->d_parent); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir), new_rename_ctx.index, old_dentry->d_parent); + } - /* Now unpin the logs. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) +out_fail: + if (logs_pinned) { btrfs_end_log_trans(root); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_end_log_trans(dest); -out_fail: + } ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: @@ -8137,6 +8337,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); struct fscrypt_name old_fname, new_fname; + bool logs_pinned = false; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -8271,22 +8472,52 @@ static int btrfs_rename(struct mnt_idmap *idmap, inode_inc_iversion(old_inode); simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { + /* + * If we are renaming in the same directory (and it's not a + * root entry) pin the log to prevent any concurrent task from + * logging the directory after we removed the old entry and + * before we add the new entry, otherwise that task can sync + * a log without any entry for the inode we are renaming and + * therefore replaying that log, if a power failure happens + * after syncing the log, would result in deleting the inode. + * + * If the rename affects two different directories, we want to + * make sure the that there's no log commit that contains + * updates for only one of the directories but not for the + * other. + * + * If we are renaming an entry for a root, we don't care about + * log updates since we called btrfs_set_log_full_commit(). + */ + btrfs_pin_log_trans(root); + btrfs_pin_log_trans(dest); + logs_pinned = true; + } + if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), BTRFS_I(old_inode), true); if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } else { ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), &old_fname.disk_name, &rename_ctx); - if (!ret) - ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } if (new_inode) { @@ -8294,18 +8525,27 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } BUG_ON(new_inode->i_nlink == 0); } else { ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), &new_fname.disk_name); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } - if (!ret && new_inode->i_nlink == 0) + if (new_inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, BTRFS_I(d_inode(new_dentry))); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } } @@ -8319,7 +8559,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = index; - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + if (logs_pinned) btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), rename_ctx.index, new_dentry->d_parent); @@ -8335,6 +8575,10 @@ static int btrfs_rename(struct mnt_idmap *idmap, } } out_fail: + if (logs_pinned) { + btrfs_end_log_trans(root); + btrfs_end_log_trans(dest); + } ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: @@ -8420,8 +8664,6 @@ static int start_delalloc_inodes(struct btrfs_root *root, struct writeback_control *wbc, bool snapshot, bool in_reclaim_context) { - struct btrfs_inode *binode; - struct inode *inode; struct btrfs_delalloc_work *work, *next; LIST_HEAD(works); LIST_HEAD(splice); @@ -8432,30 +8674,30 @@ static int start_delalloc_inodes(struct btrfs_root *root, spin_lock(&root->delalloc_lock); list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { - binode = list_entry(splice.next, struct btrfs_inode, - delalloc_inodes); + struct btrfs_inode *inode; + struct inode *tmp_inode; - list_move_tail(&binode->delalloc_inodes, - &root->delalloc_inodes); + inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes); + + list_move_tail(&inode->delalloc_inodes, &root->delalloc_inodes); if (in_reclaim_context && - test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags)) + test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags)) continue; - inode = igrab(&binode->vfs_inode); - if (!inode) { + tmp_inode = igrab(&inode->vfs_inode); + if (!tmp_inode) { cond_resched_lock(&root->delalloc_lock); continue; } spin_unlock(&root->delalloc_lock); if (snapshot) - set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, - &binode->runtime_flags); + set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags); if (full_flush) { - work = btrfs_alloc_delalloc_work(inode); + work = btrfs_alloc_delalloc_work(&inode->vfs_inode); if (!work) { - iput(inode); + iput(&inode->vfs_inode); ret = -ENOMEM; goto out; } @@ -8463,8 +8705,8 @@ static int start_delalloc_inodes(struct btrfs_root *root, btrfs_queue_work(root->fs_info->flush_workers, &work->work); } else { - ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc); - btrfs_add_delayed_iput(BTRFS_I(inode)); + ret = filemap_fdatawrite_wbc(inode->vfs_inode.i_mapping, wbc); + btrfs_add_delayed_iput(inode); if (ret || wbc->nr_to_write <= 0) goto out; } @@ -8573,7 +8815,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, .dentry = dentry, }; unsigned int trans_num_items; - int err; + int ret; int name_len; int datasize; unsigned long ptr; @@ -8581,7 +8823,12 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct extent_buffer *leaf; name_len = strlen(symname); - if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) + /* + * Symlinks utilize uncompressed inline extent data, which should not + * reach block size. + */ + if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) || + name_len >= fs_info->sectorsize) return -ENAMETOOLONG; inode = new_inode(dir->i_sb); @@ -8595,38 +8842,37 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, inode_set_bytes(inode, name_len); new_inode_args.inode = inode; - err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); - if (err) + ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); + if (ret) goto out_inode; /* 1 additional item for the inline extent */ trans_num_items++; trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_new_inode_args; } - err = btrfs_create_new_inode(trans, &new_inode_args); - if (err) + ret = btrfs_create_new_inode(trans, &new_inode_args); + if (ret) goto out; path = btrfs_alloc_path(); if (!path) { - err = -ENOMEM; - btrfs_abort_transaction(trans, err); + ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); discard_new_inode(inode); inode = NULL; goto out; } key.objectid = btrfs_ino(BTRFS_I(inode)); - key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; datasize = btrfs_file_extent_calc_inline_size(name_len); - err = btrfs_insert_empty_item(trans, root, path, &key, - datasize); - if (err) { - btrfs_abort_transaction(trans, err); + ret = btrfs_insert_empty_item(trans, root, path, &key, datasize); + if (ret) { + btrfs_abort_transaction(trans, ret); btrfs_free_path(path); discard_new_inode(inode); inode = NULL; @@ -8645,20 +8891,19 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, ptr = btrfs_file_extent_inline_start(ei); write_extent_buffer(leaf, symname, ptr, name_len); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); d_instantiate_new(dentry, inode); - err = 0; + ret = 0; out: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); out_new_inode_args: btrfs_new_inode_args_destroy(&new_inode_args); out_inode: - if (err) + if (ret) iput(inode); - return err; + return ret; } static struct btrfs_trans_handle *insert_prealloc_file_extent( @@ -8795,11 +9040,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_free_reserved_extent(fs_info, ins.objectid, - ins.offset, 0); + ins.offset, false); break; } - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset, cur_offset + ins.offset - 1, false); @@ -8817,7 +9062,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, em->generation = trans->transid; ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true); - free_extent_map(em); + btrfs_free_extent_map(em); next: num_bytes -= ins.offset; cur_offset += ins.offset; @@ -8989,7 +9234,7 @@ static ssize_t btrfs_encoded_read_inline( struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &inode->io_tree; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_file_extent_item *item; u64 ram_bytes; @@ -8999,10 +9244,8 @@ static ssize_t btrfs_encoded_read_inline( const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; path->nowait = nowait; @@ -9011,9 +9254,9 @@ static ssize_t btrfs_encoded_read_inline( if (ret) { if (ret > 0) { /* The extent item disappeared? */ - ret = -EIO; + return -EIO; } - goto out; + return ret; } leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -9026,17 +9269,16 @@ static ssize_t btrfs_encoded_read_inline( ret = btrfs_encoded_io_compression_from_extent(fs_info, btrfs_file_extent_compression(leaf, item)); if (ret < 0) - goto out; + return ret; encoded->compression = ret; if (encoded->compression) { size_t inline_size; inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); - if (inline_size > count) { - ret = -ENOBUFS; - goto out; - } + if (inline_size > count) + return -ENOBUFS; + count = inline_size; encoded->unencoded_len = ram_bytes; encoded->unencoded_offset = iocb->ki_pos - extent_start; @@ -9048,13 +9290,12 @@ static ssize_t btrfs_encoded_read_inline( } tmp = kmalloc(count, GFP_NOFS); - if (!tmp) { - ret = -ENOMEM; - goto out; - } + if (!tmp) + return -ENOMEM; + read_extent_buffer(leaf, tmp, ptr, count); btrfs_release_path(path); - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); *unlocked = true; @@ -9062,15 +9303,14 @@ static ssize_t btrfs_encoded_read_inline( if (ret != count) ret = -EFAULT; kfree(tmp); -out: - btrfs_free_path(path); + return ret; } struct btrfs_encoded_read_private { - wait_queue_head_t wait; + struct completion *sync_reads; void *uring_ctx; - atomic_t pending; + refcount_t pending_refs; blk_status_t status; }; @@ -9080,23 +9320,22 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) if (bbio->bio.bi_status) { /* - * The memory barrier implied by the atomic_dec_return() here - * pairs with the memory barrier implied by the - * atomic_dec_return() or io_wait_event() in - * btrfs_encoded_read_regular_fill_pages() to ensure that this - * write is observed before the load of status in + * The memory barrier implied by the refcount_dec_and_test() here + * pairs with the memory barrier implied by the refcount_dec_and_test() + * in btrfs_encoded_read_regular_fill_pages() to ensure that + * this write is observed before the load of status in * btrfs_encoded_read_regular_fill_pages(). */ WRITE_ONCE(priv->status, bbio->bio.bi_status); } - if (atomic_dec_return(&priv->pending) == 0) { + if (refcount_dec_and_test(&priv->pending_refs)) { int err = blk_status_to_errno(READ_ONCE(priv->status)); if (priv->uring_ctx) { btrfs_uring_read_extent_endio(priv->uring_ctx, err); kfree(priv); } else { - wake_up(&priv->wait); + complete(priv->sync_reads); } } bio_put(&bbio->bio); @@ -9107,17 +9346,27 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, struct page **pages, void *uring_ctx) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_encoded_read_private *priv; + struct btrfs_encoded_read_private *priv, sync_priv; + struct completion sync_reads; unsigned long i = 0; struct btrfs_bio *bbio; int ret; - priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS); - if (!priv) - return -ENOMEM; + /* + * Fast path for synchronous reads which completes in this call, io_uring + * needs longer time span. + */ + if (uring_ctx) { + priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS); + if (!priv) + return -ENOMEM; + } else { + priv = &sync_priv; + init_completion(&sync_reads); + priv->sync_reads = &sync_reads; + } - init_waitqueue_head(&priv->wait); - atomic_set(&priv->pending, 1); + refcount_set(&priv->pending_refs, 1); priv->status = 0; priv->uring_ctx = uring_ctx; @@ -9130,7 +9379,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { - atomic_inc(&priv->pending); + refcount_inc(&priv->pending_refs); btrfs_submit_bbio(bbio, 0); bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, @@ -9145,11 +9394,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, disk_io_size -= bytes; } while (disk_io_size); - atomic_inc(&priv->pending); + refcount_inc(&priv->pending_refs); btrfs_submit_bbio(bbio, 0); if (uring_ctx) { - if (atomic_dec_return(&priv->pending) == 0) { + if (refcount_dec_and_test(&priv->pending_refs)) { ret = blk_status_to_errno(READ_ONCE(priv->status)); btrfs_uring_read_extent_endio(uring_ctx, ret); kfree(priv); @@ -9158,12 +9407,10 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, return -EIOCBQUEUED; } else { - if (atomic_dec_return(&priv->pending) != 0) - io_wait_event(priv->wait, !atomic_read(&priv->pending)); + if (!refcount_dec_and_test(&priv->pending_refs)) + wait_for_completion_io(&sync_reads); /* See btrfs_encoded_read_endio() for ordering. */ - ret = blk_status_to_errno(READ_ONCE(priv->status)); - kfree(priv); - return ret; + return blk_status_to_errno(READ_ONCE(priv->status)); } } @@ -9196,7 +9443,7 @@ ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, if (ret) goto out; - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); *unlocked = true; @@ -9273,7 +9520,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, goto out_unlock_inode; } - if (!try_lock_extent(io_tree, start, lockend, cached_state)) { + if (!btrfs_try_lock_extent(io_tree, start, lockend, cached_state)) { ret = -EAGAIN; goto out_unlock_inode; } @@ -9282,7 +9529,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, lockend - start + 1); if (ordered) { btrfs_put_ordered_extent(ordered); - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); ret = -EAGAIN; goto out_unlock_inode; } @@ -9295,13 +9542,13 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, if (ret) goto out_unlock_inode; - lock_extent(io_tree, start, lockend, cached_state); + btrfs_lock_extent(io_tree, start, lockend, cached_state); ordered = btrfs_lookup_ordered_range(inode, start, lockend - start + 1); if (!ordered) break; btrfs_put_ordered_extent(ordered); - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); cond_resched(); } } @@ -9319,7 +9566,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, * For inline extents we get everything we need out of the * extent item. */ - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, cached_state, extent_start, @@ -9331,7 +9578,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, * We only want to return up to EOF even if the extent extends beyond * that. */ - encoded->len = min_t(u64, extent_map_end(em), + encoded->len = min_t(u64, btrfs_extent_map_end(em), inode->vfs_inode.i_size) - iocb->ki_pos; if (em->disk_bytenr == EXTENT_MAP_HOLE || (em->flags & EXTENT_FLAG_PREALLOC)) { @@ -9339,7 +9586,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, count = min_t(u64, count, encoded->len); encoded->len = count; encoded->unencoded_len = count; - } else if (extent_map_is_compressed(em)) { + } else if (btrfs_extent_map_is_compressed(em)) { *disk_bytenr = em->disk_bytenr; /* * Bail if the buffer isn't large enough to return the whole @@ -9354,12 +9601,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, encoded->unencoded_len = em->ram_bytes; encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset); ret = btrfs_encoded_io_compression_from_extent(fs_info, - extent_map_compression(em)); + btrfs_extent_map_compression(em)); if (ret < 0) goto out_em; encoded->compression = ret; } else { - *disk_bytenr = extent_map_block_start(em) + (start - em->start); + *disk_bytenr = btrfs_extent_map_block_start(em) + (start - em->start); if (encoded->len > count) encoded->len = count; /* @@ -9372,11 +9619,11 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, encoded->unencoded_len = count; *disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize); } - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; if (*disk_bytenr == EXTENT_MAP_HOLE) { - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); unlocked = true; ret = iov_iter_zero(count, iter); @@ -9388,11 +9635,11 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, } out_em: - free_extent_map(em); + btrfs_free_extent_map(em); out_unlock_extent: /* Leave inode and extent locked if we need to do a read. */ if (!unlocked && ret != -EIOCBQUEUED) - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); out_unlock_inode: if (!unlocked && ret != -EIOCBQUEUED) btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); @@ -9539,14 +9786,14 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, end >> PAGE_SHIFT); if (ret) goto out_folios; - lock_extent(io_tree, start, end, &cached_state); + btrfs_lock_extent(io_tree, start, end, &cached_state); ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); if (!ordered && !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end)) break; if (ordered) btrfs_put_ordered_extent(ordered); - unlock_extent(io_tree, start, end, &cached_state); + btrfs_unlock_extent(io_tree, start, end, &cached_state); cond_resched(); } @@ -9596,11 +9843,11 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, ret = PTR_ERR(em); goto out_free_reserved; } - free_extent_map(em); + btrfs_free_extent_map(em); ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, - (1 << BTRFS_ORDERED_ENCODED) | - (1 << BTRFS_ORDERED_COMPRESSED)); + (1U << BTRFS_ORDERED_ENCODED) | + (1U << BTRFS_ORDERED_COMPRESSED)); if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); ret = PTR_ERR(ordered); @@ -9611,7 +9858,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (start + encoded->len > inode->vfs_inode.i_size) i_size_write(&inode->vfs_inode, start + encoded->len); - unlock_extent(io_tree, start, end, &cached_state); + btrfs_unlock_extent(io_tree, start, end, &cached_state); btrfs_delalloc_release_extents(inode, num_bytes); @@ -9621,7 +9868,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, out_free_reserved: btrfs_dec_block_group_reservations(fs_info, ins.objectid); - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); out_delalloc_release: btrfs_delalloc_release_extents(inode, num_bytes); btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0); @@ -9634,9 +9881,9 @@ out_free_data_space: * bytes_may_use. */ if (!extent_reserved) - btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes); + btrfs_free_reserved_data_space_noquota(inode, disk_num_bytes); out_unlock: - unlock_extent(io_tree, start, end, &cached_state); + btrfs_unlock_extent(io_tree, start, end, &cached_state); out_folios: for (i = 0; i < nr_folios; i++) { if (folios[i]) @@ -9789,15 +10036,25 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; - struct extent_map *em = NULL; struct btrfs_chunk_map *map = NULL; struct btrfs_device *device = NULL; struct btrfs_swap_info bsi = { .lowest_ppage = (sector_t)-1ULL, }; + struct btrfs_backref_share_check_ctx *backref_ctx = NULL; + struct btrfs_path *path = NULL; int ret = 0; u64 isize; - u64 start; + u64 prev_extent_end = 0; + + /* + * Acquire the inode's mmap lock to prevent races with memory mapped + * writes, as they could happen after we flush delalloc below and before + * we lock the extent range further below. The inode was already locked + * up in the call chain. + */ + btrfs_assert_inode_locked(BTRFS_I(inode)); + down_write(&BTRFS_I(inode)->i_mmap_lock); /* * If the swap file was just created, make sure delalloc is done. If the @@ -9806,22 +10063,32 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, */ ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (ret) - return ret; + goto out_unlock_mmap; /* * The inode is locked, so these flags won't change after we check them. */ if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { btrfs_warn(fs_info, "swapfile must not be compressed"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { btrfs_warn(fs_info, "swapfile must not be checksummed"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; + } + + path = btrfs_alloc_path(); + backref_ctx = btrfs_alloc_backref_share_check_ctx(); + if (!path || !backref_ctx) { + ret = -ENOMEM; + goto out_unlock_mmap; } /* @@ -9836,7 +10103,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { btrfs_warn(fs_info, "cannot activate swapfile while exclusive operation is running"); - return -EBUSY; + ret = -EBUSY; + goto out_unlock_mmap; } /* @@ -9850,7 +10118,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because snapshot creation is in progress"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } /* * Snapshots can create extents which require COW even if NODATACOW is @@ -9866,36 +10135,53 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (btrfs_root_dead(root)) { spin_unlock(&root->root_item_lock); + btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because subvolume %llu is being deleted", btrfs_root_id(root)); - return -EPERM; + ret = -EPERM; + goto out_unlock_mmap; } atomic_inc(&root->nr_swapfiles); spin_unlock(&root->root_item_lock); isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); - lock_extent(io_tree, 0, isize - 1, &cached_state); - start = 0; - while (start < isize) { - u64 logical_block_start, physical_block_start; + btrfs_lock_extent(io_tree, 0, isize - 1, &cached_state); + while (prev_extent_end < isize) { + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *ei; struct btrfs_block_group *bg; - u64 len = isize - start; + u64 logical_block_start; + u64 physical_block_start; + u64 extent_gen; + u64 disk_bytenr; + u64 len; - em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); + key.objectid = btrfs_ino(BTRFS_I(inode)); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = prev_extent_end; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) goto out; - } - if (em->disk_bytenr == EXTENT_MAP_HOLE) { + /* + * If key not found it means we have an implicit hole (NO_HOLES + * is enabled). + */ + if (ret > 0) { btrfs_warn(fs_info, "swapfile must not have holes"); ret = -EINVAL; goto out; } - if (em->disk_bytenr == EXTENT_MAP_INLINE) { + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { /* * It's unlikely we'll ever actually find ourselves * here, as a file small enough to fit inline won't be @@ -9907,23 +10193,45 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, ret = -EINVAL; goto out; } - if (extent_map_is_compressed(em)) { + + if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { btrfs_warn(fs_info, "swapfile must not be compressed"); ret = -EINVAL; goto out; } - logical_block_start = extent_map_block_start(em) + (start - em->start); - len = min(len, em->len - (start - em->start)); - free_extent_map(em); - em = NULL; + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + if (disk_bytenr == 0) { + btrfs_warn(fs_info, "swapfile must not have holes"); + ret = -EINVAL; + goto out; + } - ret = can_nocow_extent(inode, start, &len, NULL, false, true); + logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei); + extent_gen = btrfs_file_extent_generation(leaf, ei); + prev_extent_end = btrfs_file_extent_end(path); + + if (prev_extent_end > isize) + len = isize - key.offset; + else + len = btrfs_file_extent_num_bytes(leaf, ei); + + backref_ctx->curr_leaf_bytenr = leaf->start; + + /* + * Don't need the path anymore, release to avoid deadlocks when + * calling btrfs_is_data_extent_shared() because when joining a + * transaction it can block waiting for the current one's commit + * which in turn may be trying to lock the same leaf to flush + * delayed items for example. + */ + btrfs_release_path(path); + + ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr, + extent_gen, backref_ctx); if (ret < 0) { goto out; - } else if (ret) { - ret = 0; - } else { + } else if (ret > 0) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); ret = -EINVAL; @@ -9958,7 +10266,6 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, physical_block_start = (map->stripes[0].physical + (logical_block_start - map->start)); - len = min(len, map->chunk_len - (logical_block_start - map->start)); btrfs_free_chunk_map(map); map = NULL; @@ -9999,24 +10306,27 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret) goto out; } - bsi.start = start; + bsi.start = key.offset; bsi.block_start = physical_block_start; bsi.block_len = len; } - start += len; + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } + + cond_resched(); } if (bsi.block_len) ret = btrfs_add_swap_extent(sis, &bsi); out: - if (!IS_ERR_OR_NULL(em)) - free_extent_map(em); if (!IS_ERR_OR_NULL(map)) btrfs_free_chunk_map(map); - unlock_extent(io_tree, 0, isize - 1, &cached_state); + btrfs_unlock_extent(io_tree, 0, isize - 1, &cached_state); if (ret) btrfs_swap_deactivate(file); @@ -10025,6 +10335,10 @@ out: btrfs_exclop_finish(fs_info); +out_unlock_mmap: + up_write(&BTRFS_I(inode)->i_mmap_lock); + btrfs_free_backref_share_ctx(backref_ctx); + btrfs_free_path(path); if (ret) return ret; @@ -10033,7 +10347,6 @@ out: *span = bsi.highest_ppage - bsi.lowest_ppage + 1; sis->max = bsi.nr_pages; sis->pages = bsi.nr_pages - 1; - sis->highest_bit = bsi.nr_pages - 1; return bsi.nr_extents; } #else diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c9302d193187..7e13de2bdcbf 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -118,8 +118,8 @@ struct btrfs_ioctl_encoded_io_args_32 { #endif /* Mask out flags that are inappropriate for the given type of inode. */ -static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode, - unsigned int flags) +static unsigned int btrfs_mask_fsflags_for_type(const struct inode *inode, + unsigned int flags) { if (S_ISDIR(inode->i_mode)) return flags; @@ -133,11 +133,11 @@ static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode, * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS * ioctl. */ -static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode) +static unsigned int btrfs_inode_flags_to_fsflags(const struct btrfs_inode *inode) { unsigned int iflags = 0; - u32 flags = binode->flags; - u32 ro_flags = binode->ro_flags; + u32 flags = inode->flags; + u32 ro_flags = inode->ro_flags; if (flags & BTRFS_INODE_SYNC) iflags |= FS_SYNC_FL; @@ -167,25 +167,24 @@ static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode) /* * Update inode->i_flags based on the btrfs internal flags. */ -void btrfs_sync_inode_flags_to_i_flags(struct inode *inode) +void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode) { - struct btrfs_inode *binode = BTRFS_I(inode); unsigned int new_fl = 0; - if (binode->flags & BTRFS_INODE_SYNC) + if (inode->flags & BTRFS_INODE_SYNC) new_fl |= S_SYNC; - if (binode->flags & BTRFS_INODE_IMMUTABLE) + if (inode->flags & BTRFS_INODE_IMMUTABLE) new_fl |= S_IMMUTABLE; - if (binode->flags & BTRFS_INODE_APPEND) + if (inode->flags & BTRFS_INODE_APPEND) new_fl |= S_APPEND; - if (binode->flags & BTRFS_INODE_NOATIME) + if (inode->flags & BTRFS_INODE_NOATIME) new_fl |= S_NOATIME; - if (binode->flags & BTRFS_INODE_DIRSYNC) + if (inode->flags & BTRFS_INODE_DIRSYNC) new_fl |= S_DIRSYNC; - if (binode->ro_flags & BTRFS_INODE_RO_VERITY) + if (inode->ro_flags & BTRFS_INODE_RO_VERITY) new_fl |= S_VERITY; - set_mask_bits(&inode->i_flags, + set_mask_bits(&inode->vfs_inode.i_flags, S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC | S_VERITY, new_fl); } @@ -219,7 +218,7 @@ static int check_fsflags(unsigned int old_flags, unsigned int flags) return 0; } -static int check_fsflags_compatible(struct btrfs_fs_info *fs_info, +static int check_fsflags_compatible(const struct btrfs_fs_info *fs_info, unsigned int flags) { if (btrfs_is_zoned(fs_info) && (flags & FS_NOCOW_FL)) @@ -246,26 +245,25 @@ static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_ * Set flags/xflags from the internal inode flags. The remaining items of * fsxattr are zeroed. */ -int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int btrfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { - struct btrfs_inode *binode = BTRFS_I(d_inode(dentry)); + const struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); - fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode)); + fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(inode)); return 0; } int btrfs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { - struct inode *inode = d_inode(dentry); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_inode *binode = BTRFS_I(inode); - struct btrfs_root *root = binode->root; + struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; unsigned int fsflags, old_fsflags; int ret; const char *comp = NULL; - u32 binode_flags; + u32 inode_flags; if (btrfs_root_readonly(root)) return -EROFS; @@ -273,8 +271,8 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; - fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags); - old_fsflags = btrfs_inode_flags_to_fsflags(binode); + fsflags = btrfs_mask_fsflags_for_type(&inode->vfs_inode, fa->flags); + old_fsflags = btrfs_inode_flags_to_fsflags(inode); ret = check_fsflags(old_fsflags, fsflags); if (ret) return ret; @@ -283,27 +281,27 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, if (ret) return ret; - binode_flags = binode->flags; + inode_flags = inode->flags; if (fsflags & FS_SYNC_FL) - binode_flags |= BTRFS_INODE_SYNC; + inode_flags |= BTRFS_INODE_SYNC; else - binode_flags &= ~BTRFS_INODE_SYNC; + inode_flags &= ~BTRFS_INODE_SYNC; if (fsflags & FS_IMMUTABLE_FL) - binode_flags |= BTRFS_INODE_IMMUTABLE; + inode_flags |= BTRFS_INODE_IMMUTABLE; else - binode_flags &= ~BTRFS_INODE_IMMUTABLE; + inode_flags &= ~BTRFS_INODE_IMMUTABLE; if (fsflags & FS_APPEND_FL) - binode_flags |= BTRFS_INODE_APPEND; + inode_flags |= BTRFS_INODE_APPEND; else - binode_flags &= ~BTRFS_INODE_APPEND; + inode_flags &= ~BTRFS_INODE_APPEND; if (fsflags & FS_NODUMP_FL) - binode_flags |= BTRFS_INODE_NODUMP; + inode_flags |= BTRFS_INODE_NODUMP; else - binode_flags &= ~BTRFS_INODE_NODUMP; + inode_flags &= ~BTRFS_INODE_NODUMP; if (fsflags & FS_NOATIME_FL) - binode_flags |= BTRFS_INODE_NOATIME; + inode_flags |= BTRFS_INODE_NOATIME; else - binode_flags &= ~BTRFS_INODE_NOATIME; + inode_flags &= ~BTRFS_INODE_NOATIME; /* If coming from FS_IOC_FSSETXATTR then skip unconverted flags */ if (!fa->flags_valid) { @@ -315,32 +313,32 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, } if (fsflags & FS_DIRSYNC_FL) - binode_flags |= BTRFS_INODE_DIRSYNC; + inode_flags |= BTRFS_INODE_DIRSYNC; else - binode_flags &= ~BTRFS_INODE_DIRSYNC; + inode_flags &= ~BTRFS_INODE_DIRSYNC; if (fsflags & FS_NOCOW_FL) { - if (S_ISREG(inode->i_mode)) { + if (S_ISREG(inode->vfs_inode.i_mode)) { /* * It's safe to turn csums off here, no extents exist. * Otherwise we want the flag to reflect the real COW * status of the file and will not set it. */ - if (inode->i_size == 0) - binode_flags |= BTRFS_INODE_NODATACOW | - BTRFS_INODE_NODATASUM; + if (inode->vfs_inode.i_size == 0) + inode_flags |= BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM; } else { - binode_flags |= BTRFS_INODE_NODATACOW; + inode_flags |= BTRFS_INODE_NODATACOW; } } else { /* * Revert back under same assumptions as above */ - if (S_ISREG(inode->i_mode)) { - if (inode->i_size == 0) - binode_flags &= ~(BTRFS_INODE_NODATACOW | - BTRFS_INODE_NODATASUM); + if (S_ISREG(inode->vfs_inode.i_mode)) { + if (inode->vfs_inode.i_size == 0) + inode_flags &= ~(BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM); } else { - binode_flags &= ~BTRFS_INODE_NODATACOW; + inode_flags &= ~BTRFS_INODE_NODATACOW; } } @@ -350,21 +348,21 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, * things smaller. */ if (fsflags & FS_NOCOMP_FL) { - binode_flags &= ~BTRFS_INODE_COMPRESS; - binode_flags |= BTRFS_INODE_NOCOMPRESS; + inode_flags &= ~BTRFS_INODE_COMPRESS; + inode_flags |= BTRFS_INODE_NOCOMPRESS; } else if (fsflags & FS_COMPR_FL) { - if (IS_SWAPFILE(inode)) + if (IS_SWAPFILE(&inode->vfs_inode)) return -ETXTBSY; - binode_flags |= BTRFS_INODE_COMPRESS; - binode_flags &= ~BTRFS_INODE_NOCOMPRESS; + inode_flags |= BTRFS_INODE_COMPRESS; + inode_flags &= ~BTRFS_INODE_NOCOMPRESS; comp = btrfs_compress_type2str(fs_info->compress_type); if (!comp || comp[0] == 0) comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB); } else { - binode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); + inode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } /* @@ -376,15 +374,14 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, return PTR_ERR(trans); if (comp) { - ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression", + ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp, strlen(comp), 0); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } } else { - ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression", - NULL, 0, 0); + ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL, 0, 0); if (ret && ret != -ENODATA) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -392,98 +389,19 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, } update_flags: - binode->flags = binode_flags; + inode->flags = inode_flags; + btrfs_update_inode_mapping_flags(inode); btrfs_sync_inode_flags_to_i_flags(inode); - inode_inc_iversion(inode); - inode_set_ctime_current(inode); - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + inode_inc_iversion(&inode->vfs_inode); + inode_set_ctime_current(&inode->vfs_inode); + ret = btrfs_update_inode(trans, inode); out_end_trans: btrfs_end_transaction(trans); return ret; } -/* - * Start exclusive operation @type, return true on success - */ -bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation type) -{ - bool ret = false; - - spin_lock(&fs_info->super_lock); - if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) { - fs_info->exclusive_operation = type; - ret = true; - } - spin_unlock(&fs_info->super_lock); - - return ret; -} - -/* - * Conditionally allow to enter the exclusive operation in case it's compatible - * with the running one. This must be paired with btrfs_exclop_start_unlock and - * btrfs_exclop_finish. - * - * Compatibility: - * - the same type is already running - * - when trying to add a device and balance has been paused - * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller - * must check the condition first that would allow none -> @type - */ -bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation type) -{ - spin_lock(&fs_info->super_lock); - if (fs_info->exclusive_operation == type || - (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED && - type == BTRFS_EXCLOP_DEV_ADD)) - return true; - - spin_unlock(&fs_info->super_lock); - return false; -} - -void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info) -{ - spin_unlock(&fs_info->super_lock); -} - -void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) -{ - spin_lock(&fs_info->super_lock); - WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); - spin_unlock(&fs_info->super_lock); - sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); -} - -void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation op) -{ - switch (op) { - case BTRFS_EXCLOP_BALANCE_PAUSED: - spin_lock(&fs_info->super_lock); - ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE || - fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD || - fs_info->exclusive_operation == BTRFS_EXCLOP_NONE || - fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); - fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED; - spin_unlock(&fs_info->super_lock); - break; - case BTRFS_EXCLOP_BALANCE: - spin_lock(&fs_info->super_lock); - ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); - fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; - spin_unlock(&fs_info->super_lock); - break; - default: - btrfs_warn(fs_info, - "invalid exclop balance operation %d requested", op); - } -} - -static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg) +static int btrfs_ioctl_getversion(const struct inode *inode, int __user *arg) { return put_user(inode->i_generation, arg); } @@ -551,22 +469,11 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, return ret; } -int __pure btrfs_is_empty_uuid(const u8 *uuid) -{ - int i; - - for (i = 0; i < BTRFS_UUID_SIZE; i++) { - if (uuid[i]) - return 0; - } - return 1; -} - /* * Calculate the number of transaction items to reserve for creating a subvolume * or snapshot, not including the inode, directory entries, or parent directory. */ -static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit) +static unsigned int create_subvol_num_items(const struct btrfs_qgroup_inherit *inherit) { /* * 1 to add root block @@ -708,8 +615,8 @@ static noinline int create_subvol(struct mnt_idmap *idmap, btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID); key.objectid = objectid; - key.offset = 0; key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = 0; ret = btrfs_insert_root(trans, fs_info->tree_root, &key, root_item); if (ret) { @@ -759,14 +666,14 @@ static noinline int create_subvol(struct mnt_idmap *idmap, goto out; } + btrfs_record_new_subvolume(trans, BTRFS_I(dir)); + ret = btrfs_create_new_inode(trans, &new_inode_args); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } - btrfs_record_new_subvolume(trans, BTRFS_I(dir)); - d_instantiate_new(dentry, new_inode_args.inode); new_inode_args.inode = NULL; @@ -934,7 +841,7 @@ free_pending: static int btrfs_may_delete(struct mnt_idmap *idmap, struct inode *dir, struct dentry *victim, int isdir) { - int error; + int ret; if (d_really_is_negative(victim)) return -ENOENT; @@ -944,9 +851,9 @@ static int btrfs_may_delete(struct mnt_idmap *idmap, return -EINVAL; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); - error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); - if (error) - return error; + ret = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); + if (ret) + return ret; if (IS_APPEND(dir)) return -EPERM; if (check_sticky(idmap, dir, d_inode(victim)) || @@ -969,7 +876,7 @@ static int btrfs_may_delete(struct mnt_idmap *idmap, /* copy of may_create in fs/namei.c() */ static inline int btrfs_may_create(struct mnt_idmap *idmap, - struct inode *dir, struct dentry *child) + struct inode *dir, const struct dentry *child) { if (d_really_is_positive(child)) return -EEXIST; @@ -985,39 +892,37 @@ static inline int btrfs_may_create(struct mnt_idmap *idmap, * sys_mkdirat and vfs_mkdir, but we only do a single component lookup * inside this filesystem so it's quite a bit simpler. */ -static noinline int btrfs_mksubvol(const struct path *parent, +static noinline int btrfs_mksubvol(struct dentry *parent, struct mnt_idmap *idmap, - const char *name, int namelen, - struct btrfs_root *snap_src, + struct qstr *qname, struct btrfs_root *snap_src, bool readonly, struct btrfs_qgroup_inherit *inherit) { - struct inode *dir = d_inode(parent->dentry); + struct inode *dir = d_inode(parent); struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct dentry *dentry; - struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen); - int error; + struct fscrypt_str name_str = FSTR_INIT((char *)qname->name, qname->len); + int ret; - error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); - if (error == -EINTR) - return error; + ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); + if (ret == -EINTR) + return ret; - dentry = lookup_one(idmap, name, parent->dentry, namelen); - error = PTR_ERR(dentry); + dentry = lookup_one(idmap, qname, parent); + ret = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_unlock; - error = btrfs_may_create(idmap, dir, dentry); - if (error) + ret = btrfs_may_create(idmap, dir, dentry); + if (ret) goto out_dput; /* * even if this name doesn't exist, we may get hash collisions. * check for them now when we can safely fail */ - error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, - dir->i_ino, &name_str); - if (error) + ret = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, dir->i_ino, &name_str); + if (ret) goto out_dput; down_read(&fs_info->subvol_sem); @@ -1026,11 +931,11 @@ static noinline int btrfs_mksubvol(const struct path *parent, goto out_up_read; if (snap_src) - error = create_snapshot(snap_src, dir, dentry, readonly, inherit); + ret = create_snapshot(snap_src, dir, dentry, readonly, inherit); else - error = create_subvol(idmap, dir, dentry, inherit); + ret = create_subvol(idmap, dir, dentry, inherit); - if (!error) + if (!ret) fsnotify_mkdir(dir, dentry); out_up_read: up_read(&fs_info->subvol_sem); @@ -1038,12 +943,12 @@ out_dput: dput(dentry); out_unlock: btrfs_inode_unlock(BTRFS_I(dir), 0); - return error; + return ret; } -static noinline int btrfs_mksnapshot(const struct path *parent, +static noinline int btrfs_mksnapshot(struct dentry *parent, struct mnt_idmap *idmap, - const char *name, int namelen, + struct qstr *qname, struct btrfs_root *root, bool readonly, struct btrfs_qgroup_inherit *inherit) @@ -1070,8 +975,8 @@ static noinline int btrfs_mksnapshot(const struct path *parent, btrfs_wait_ordered_extents(root, U64_MAX, NULL); - ret = btrfs_mksubvol(parent, idmap, name, namelen, - root, readonly, inherit); + ret = btrfs_mksubvol(parent, idmap, qname, root, readonly, inherit); + atomic_dec(&root->snapshot_force_cow); out: btrfs_drew_read_unlock(&root->snapshot_lock); @@ -1124,17 +1029,14 @@ static noinline int btrfs_ioctl_resize(struct file *file, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_fs_info *fs_info = root->fs_info; u64 new_size; u64 old_size; u64 devid = 1; - struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_vol_args *vol_args; - struct btrfs_trans_handle *trans; struct btrfs_device *device = NULL; char *sizestr; - char *retptr; char *devstr = NULL; int ret = 0; int mod = 0; @@ -1202,6 +1104,8 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (!strcmp(sizestr, "max")) new_size = bdev_nr_bytes(device->bdev); else { + char *retptr; + if (sizestr[0] == '-') { mod = -1; sizestr++; @@ -1249,6 +1153,8 @@ static noinline int btrfs_ioctl_resize(struct file *file, new_size = round_down(new_size, fs_info->sectorsize); if (new_size > old_size) { + struct btrfs_trans_handle *trans; + trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -1261,7 +1167,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, } /* equal, nothing need to do */ if (ret == 0 && new_size != old_size) - btrfs_info_in_rcu(fs_info, + btrfs_info(fs_info, "resize device %s (devid %llu) from %llu to %llu", btrfs_dev_name(device), device->devid, old_size, new_size); @@ -1276,12 +1182,12 @@ out_drop: static noinline int __btrfs_ioctl_snap_create(struct file *file, struct mnt_idmap *idmap, - const char *name, unsigned long fd, int subvol, + const char *name, unsigned long fd, bool subvol, bool readonly, struct btrfs_qgroup_inherit *inherit) { - int namelen; int ret = 0; + struct qstr qname = QSTR_INIT(name, strlen(name)); if (!S_ISDIR(file_inode(file)->i_mode)) return -ENOTDIR; @@ -1290,21 +1196,20 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, if (ret) goto out; - namelen = strlen(name); if (strchr(name, '/')) { ret = -EINVAL; goto out_drop_write; } - if (name[0] == '.' && - (namelen == 1 || (name[1] == '.' && namelen == 2))) { + if (qname.name[0] == '.' && + (qname.len == 1 || (qname.name[1] == '.' && qname.len == 2))) { ret = -EEXIST; goto out_drop_write; } if (subvol) { - ret = btrfs_mksubvol(&file->f_path, idmap, name, - namelen, NULL, readonly, inherit); + ret = btrfs_mksubvol(file_dentry(file), idmap, &qname, NULL, + readonly, inherit); } else { CLASS(fd, src)(fd); struct inode *src_inode; @@ -1334,8 +1239,7 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, */ ret = -EINVAL; } else { - ret = btrfs_mksnapshot(&file->f_path, idmap, - name, namelen, + ret = btrfs_mksnapshot(file_dentry(file), idmap, &qname, BTRFS_I(src_inode)->root, readonly, inherit); } @@ -1372,7 +1276,7 @@ out: } static noinline int btrfs_ioctl_snap_create_v2(struct file *file, - void __user *arg, int subvol) + void __user *arg, bool subvol) { struct btrfs_ioctl_vol_args_v2 *vol_args; int ret; @@ -1427,15 +1331,15 @@ free_args: return ret; } -static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode, +static noinline int btrfs_ioctl_subvol_getflags(struct btrfs_inode *inode, void __user *arg) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; u64 flags = 0; - if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) return -EINVAL; down_read(&fs_info->subvol_sem); @@ -1538,8 +1442,8 @@ out: return ret; } -static noinline int key_in_sk(struct btrfs_key *key, - struct btrfs_ioctl_search_key *sk) +static noinline bool key_in_sk(const struct btrfs_key *key, + const struct btrfs_ioctl_search_key *sk) { struct btrfs_key test; int ret; @@ -1550,7 +1454,7 @@ static noinline int key_in_sk(struct btrfs_key *key, ret = btrfs_comp_cpu_keys(key, &test); if (ret < 0) - return 0; + return false; test.objectid = sk->max_objectid; test.type = sk->max_type; @@ -1558,13 +1462,13 @@ static noinline int key_in_sk(struct btrfs_key *key, ret = btrfs_comp_cpu_keys(key, &test); if (ret > 0) - return 0; - return 1; + return false; + return true; } static noinline int copy_to_sk(struct btrfs_path *path, struct btrfs_key *key, - struct btrfs_ioctl_search_key *sk, + const struct btrfs_ioctl_search_key *sk, u64 *buf_size, char __user *ubuf, unsigned long *sk_offset, @@ -1621,8 +1525,8 @@ static noinline int copy_to_sk(struct btrfs_path *path, } sh.objectid = key->objectid; - sh.offset = key->offset; sh.type = key->type; + sh.offset = key->offset; sh.len = item_len; sh.transid = found_transid; @@ -1695,13 +1599,12 @@ out: return ret; } -static noinline int search_ioctl(struct inode *inode, +static noinline int search_ioctl(struct btrfs_root *root, struct btrfs_ioctl_search_key *sk, u64 *buf_size, char __user *ubuf) { - struct btrfs_fs_info *info = inode_to_fs_info(inode); - struct btrfs_root *root; + struct btrfs_fs_info *info = root->fs_info; struct btrfs_key key; struct btrfs_path *path; int ret; @@ -1718,9 +1621,10 @@ static noinline int search_ioctl(struct inode *inode, return -ENOMEM; if (sk->tree_id == 0) { - /* search the root of the inode that was passed */ - root = btrfs_grab_root(BTRFS_I(inode)->root); + /* Search the root that we got passed. */ + root = btrfs_grab_root(root); } else { + /* Look up the root from the arguments. */ root = btrfs_get_fs_root(info, sk->tree_id, true); if (IS_ERR(root)) { btrfs_free_path(path); @@ -1733,21 +1637,19 @@ static noinline int search_ioctl(struct inode *inode, key.offset = sk->min_offset; while (1) { - ret = -EFAULT; /* * Ensure that the whole user buffer is faulted in at sub-page * granularity, otherwise the loop may live-lock. */ - if (fault_in_subpage_writeable(ubuf + sk_offset, - *buf_size - sk_offset)) + if (fault_in_subpage_writeable(ubuf + sk_offset, *buf_size - sk_offset)) { + ret = -EFAULT; break; + } ret = btrfs_search_forward(root, &key, path, sk->min_transid); - if (ret != 0) { - if (ret > 0) - ret = 0; - goto err; - } + if (ret) + break; + ret = copy_to_sk(path, &key, sk, buf_size, ubuf, &sk_offset, &num_found); btrfs_release_path(path); @@ -1755,16 +1657,17 @@ static noinline int search_ioctl(struct inode *inode, break; } + /* Normalize return values from btrfs_search_forward() and copy_to_sk(). */ if (ret > 0) ret = 0; -err: + sk->nr_items = num_found; btrfs_put_root(root); btrfs_free_path(path); return ret; } -static noinline int btrfs_ioctl_tree_search(struct inode *inode, +static noinline int btrfs_ioctl_tree_search(struct btrfs_root *root, void __user *argp) { struct btrfs_ioctl_search_args __user *uargs = argp; @@ -1780,7 +1683,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode, buf_size = sizeof(uargs->buf); - ret = search_ioctl(inode, &sk, &buf_size, uargs->buf); + ret = search_ioctl(root, &sk, &buf_size, uargs->buf); /* * In the origin implementation an overflow is handled by returning a @@ -1794,7 +1697,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode, return ret; } -static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, +static noinline int btrfs_ioctl_tree_search_v2(struct btrfs_root *root, void __user *argp) { struct btrfs_ioctl_search_args_v2 __user *uarg = argp; @@ -1816,7 +1719,7 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, if (buf_size > buf_limit) buf_size = buf_limit; - ret = search_ioctl(inode, &args.key, &buf_size, + ret = search_ioctl(root, &args.key, &buf_size, (char __user *)(&uarg->buf[0])); if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) ret = -EFAULT; @@ -1924,7 +1827,6 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, struct btrfs_path *path; struct btrfs_key key, key2; struct extent_buffer *leaf; - struct inode *temp_inode; char *ptr; int slot; int len; @@ -1952,6 +1854,8 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; while (1) { + struct btrfs_inode *temp_inode; + ret = btrfs_search_backwards(root, &key, path); if (ret < 0) goto out_put; @@ -2006,9 +1910,9 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, ret = PTR_ERR(temp_inode); goto out_put; } - ret = inode_permission(idmap, temp_inode, + ret = inode_permission(idmap, &temp_inode->vfs_inode, MAY_READ | MAY_EXEC); - iput(temp_inode); + iput(&temp_inode->vfs_inode); if (ret) { ret = -EACCES; goto out_put; @@ -2380,7 +2284,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL; struct mnt_idmap *idmap = file_mnt_idmap(file); char *subvol_name, *subvol_name_ptr = NULL; - int subvol_namelen; int ret = 0; bool destroy_parent = false; @@ -2503,10 +2406,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out; } - subvol_namelen = strlen(subvol_name); - if (strchr(subvol_name, '/') || - strncmp(subvol_name, "..", subvol_namelen) == 0) { + strcmp(subvol_name, "..") == 0) { ret = -EINVAL; goto free_subvol_name; } @@ -2519,7 +2420,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); if (ret == -EINTR) goto free_subvol_name; - dentry = lookup_one(idmap, subvol_name, parent, subvol_namelen); + dentry = lookup_one(idmap, &QSTR(subvol_name), parent); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); goto out_unlock_dir; @@ -2635,6 +2536,15 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) goto out; } + /* + * Don't allow defrag on pre-content watched files, as it could + * populate the page cache with 0's via readahead. + */ + if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) { + ret = -EINVAL; + goto out; + } + if (argp) { if (copy_from_user(&range, argp, sizeof(range))) { ret = -EFAULT; @@ -2644,8 +2554,14 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EOPNOTSUPP; goto out; } - /* compression requires us to start the IO */ - if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { + if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS) && + (range.flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS)) { + ret = -EINVAL; + goto out; + } + /* Compression or no-compression require to start the IO. */ + if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS) || + (range.flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS)) { range.flags |= BTRFS_DEFRAG_RANGE_START_IO; range.extent_thresh = (u32)-1; } @@ -2653,7 +2569,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) /* the rest are all set to zero by kzalloc */ range.len = (u64)-1; } - ret = btrfs_defrag_file(file_inode(file), &file->f_ra, + ret = btrfs_defrag_file(BTRFS_I(file_inode(file)), &file->f_ra, &range, BTRFS_OLDEST_GENERATION, 0); if (ret > 0) ret = 0; @@ -2786,7 +2702,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) err_drop: mnt_drop_write_file(file); if (bdev_file) - fput(bdev_file); + bdev_fput(bdev_file); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); @@ -2837,7 +2753,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) mnt_drop_write_file(file); if (bdev_file) - fput(bdev_file); + bdev_fput(bdev_file); out: btrfs_put_dev_args_from_path(&args); out_free: @@ -2845,7 +2761,7 @@ out_free: return ret; } -static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, +static long btrfs_ioctl_fs_info(const struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_fs_info_args *fi_args; @@ -2899,7 +2815,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, return ret; } -static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, +static long btrfs_ioctl_dev_info(const struct btrfs_fs_info *fs_info, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); @@ -2976,7 +2892,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) ret = PTR_ERR(new_root); goto out; } - if (!is_fstree(btrfs_root_id(new_root))) { + if (!btrfs_is_fstree(btrfs_root_id(new_root))) { ret = -ENOENT; goto out_free; } @@ -3007,7 +2923,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); btrfs_set_dir_item_key(path->nodes[0], di, &disk_key); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL); @@ -3226,7 +3141,7 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg) return -EPERM; if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { - btrfs_err(fs_info, "scrub is not supported on extent tree v2 yet"); + btrfs_err(fs_info, "scrub: extent tree v2 not yet supported"); return -EINVAL; } @@ -3444,7 +3359,6 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, int size; struct btrfs_ioctl_logical_ino_args *loi; struct btrfs_data_container *inodes = NULL; - struct btrfs_path *path = NULL; bool ignore_offset; if (!capable(CAP_SYS_ADMIN)) @@ -3478,14 +3392,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, goto out_loi; } - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - ret = iterate_inodes_from_logical(loi->logical, fs_info, path, - inodes, ignore_offset); - btrfs_free_path(path); + ret = iterate_inodes_from_logical(loi->logical, fs_info, inodes, ignore_offset); if (ret == -EINVAL) ret = -ENOENT; if (ret < 0) @@ -3802,22 +3709,6 @@ drop_write: return ret; } -/* - * Quick check for ioctl handlers if quotas are enabled. Proper locking must be - * done before any operations. - */ -static bool qgroup_enabled(struct btrfs_fs_info *fs_info) -{ - bool ret = true; - - mutex_lock(&fs_info->qgroup_ioctl_lock); - if (!fs_info->quota_root) - ret = false; - mutex_unlock(&fs_info->qgroup_ioctl_lock); - - return ret; -} - static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); @@ -3832,7 +3723,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!qgroup_enabled(root->fs_info)) + if (!btrfs_qgroup_enabled(fs_info)) return -ENOTCONN; ret = mnt_want_write_file(file); @@ -3902,7 +3793,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!qgroup_enabled(root->fs_info)) + if (!btrfs_qgroup_enabled(root->fs_info)) return -ENOTCONN; ret = mnt_want_write_file(file); @@ -3920,7 +3811,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) goto out; } - if (sa->create && is_fstree(sa->qgroupid)) { + if (sa->create && btrfs_is_fstree(sa->qgroupid)) { ret = -EINVAL; goto out; } @@ -3961,7 +3852,7 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!qgroup_enabled(root->fs_info)) + if (!btrfs_qgroup_enabled(root->fs_info)) return -ENOTCONN; ret = mnt_want_write_file(file); @@ -4009,7 +3900,7 @@ static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!qgroup_enabled(fs_info)) + if (!btrfs_qgroup_enabled(fs_info)) return -ENOTCONN; ret = mnt_want_write_file(file); @@ -4287,7 +4178,7 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) } spin_lock(&fs_info->super_lock); - strcpy(super_block->label, label); + strscpy(super_block->label, label); spin_unlock(&fs_info->super_lock); ret = btrfs_commit_transaction(trans); @@ -4331,7 +4222,7 @@ static int btrfs_ioctl_get_features(struct btrfs_fs_info *fs_info, return 0; } -static int check_feature_bits(struct btrfs_fs_info *fs_info, +static int check_feature_bits(const struct btrfs_fs_info *fs_info, enum btrfs_feature_set set, u64 change_mask, u64 flags, u64 supported_flags, u64 safe_set, u64 safe_clear) @@ -4467,7 +4358,7 @@ out_drop_write: return ret; } -static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool compat) +static int _btrfs_ioctl_send(struct btrfs_root *root, void __user *argp, bool compat) { struct btrfs_ioctl_send_args *arg; int ret; @@ -4498,7 +4389,7 @@ static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool if (IS_ERR(arg)) return PTR_ERR(arg); } - ret = btrfs_ioctl_send(inode, arg); + ret = btrfs_ioctl_send(root, arg); kfree(arg); return ret; } @@ -4594,7 +4485,7 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, args.compression, &unlocked); if (!unlocked) { - unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_unlock_extent(io_tree, start, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); } } @@ -4716,6 +4607,13 @@ out_acct: return ret; } +struct btrfs_uring_encoded_data { + struct btrfs_ioctl_encoded_io_args args; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov; + struct iov_iter iter; +}; + /* * Context that's attached to an encoded read io_uring command, in cmd->pdu. It * contains the fields in btrfs_uring_read_extent that are necessary to finish @@ -4737,6 +4635,7 @@ struct btrfs_uring_priv { }; struct io_btrfs_cmd { + struct btrfs_uring_encoded_data *data; struct btrfs_uring_priv *priv; }; @@ -4746,11 +4645,14 @@ static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int iss struct btrfs_uring_priv *priv = bc->priv; struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp)); struct extent_io_tree *io_tree = &inode->io_tree; - unsigned long index; + pgoff_t index; u64 cur; size_t page_offset; ssize_t ret; + /* The inode lock has already been acquired in btrfs_uring_read_extent. */ + btrfs_lockdep_inode_acquire(inode, i_rwsem); + if (priv->err) { ret = priv->err; goto out; @@ -4780,7 +4682,7 @@ static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int iss ret = priv->count; out: - unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); + btrfs_unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); io_uring_cmd_done(cmd, ret, 0, issue_flags); @@ -4792,6 +4694,7 @@ out: kfree(priv->pages); kfree(priv->iov); kfree(priv); + kfree(bc->data); } void btrfs_uring_read_extent_endio(void *ctx, int err) @@ -4859,10 +4762,17 @@ static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter, * and inode and freeing the allocations. */ + /* + * We're returning to userspace with the inode lock held, and that's + * okay - it'll get unlocked in a worker thread. Call + * btrfs_lockdep_inode_release() to avoid confusing lockdep. + */ + btrfs_lockdep_inode_release(inode, i_rwsem); + return -EIOCBQUEUED; out_fail: - unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_unlock_extent(io_tree, start, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); kfree(priv); return ret; @@ -4872,21 +4782,22 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue { size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); size_t copy_end; - struct btrfs_ioctl_encoded_io_args args = { 0 }; int ret; u64 disk_bytenr, disk_io_size; struct file *file; struct btrfs_inode *inode; struct btrfs_fs_info *fs_info; struct extent_io_tree *io_tree; - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - struct iov_iter iter; loff_t pos; struct kiocb kiocb; struct extent_state *cached_state = NULL; u64 start, lockend; void __user *sqe_addr; + struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); + struct btrfs_uring_encoded_data *data = NULL; + + if (cmd->flags & IORING_URING_CMD_REISSUE) + data = bc->data; if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; @@ -4900,43 +4811,65 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue if (issue_flags & IO_URING_F_COMPAT) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) - struct btrfs_ioctl_encoded_io_args_32 args32; - copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags); - if (copy_from_user(&args32, sqe_addr, copy_end)) { - ret = -EFAULT; - goto out_acct; - } - args.iov = compat_ptr(args32.iov); - args.iovcnt = args32.iovcnt; - args.offset = args32.offset; - args.flags = args32.flags; #else - return -ENOTTY; + ret = -ENOTTY; + goto out_acct; #endif } else { copy_end = copy_end_kernel; - if (copy_from_user(&args, sqe_addr, copy_end)) { - ret = -EFAULT; + } + + if (!data) { + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) { + ret = -ENOMEM; goto out_acct; } - } - if (args.flags != 0) - return -EINVAL; + bc->data = data; - ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), - &iov, &iter); - if (ret < 0) - goto out_acct; + if (issue_flags & IO_URING_F_COMPAT) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; - if (iov_iter_count(&iter) == 0) { - ret = 0; - goto out_free; + if (copy_from_user(&args32, sqe_addr, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + + data->args.iov = compat_ptr(args32.iov); + data->args.iovcnt = args32.iovcnt; + data->args.offset = args32.offset; + data->args.flags = args32.flags; +#endif + } else { + if (copy_from_user(&data->args, sqe_addr, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + } + + if (data->args.flags != 0) { + ret = -EINVAL; + goto out_acct; + } + + data->iov = data->iovstack; + ret = import_iovec(ITER_DEST, data->args.iov, data->args.iovcnt, + ARRAY_SIZE(data->iovstack), &data->iov, + &data->iter); + if (ret < 0) + goto out_acct; + + if (iov_iter_count(&data->iter) == 0) { + ret = 0; + goto out_free; + } } - pos = args.offset; - ret = rw_verify_area(READ, file, &pos, args.len); + pos = data->args.offset; + ret = rw_verify_area(READ, file, &pos, data->args.len); if (ret < 0) goto out_free; @@ -4949,17 +4882,20 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue start = ALIGN_DOWN(pos, fs_info->sectorsize); lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; - ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state, + ret = btrfs_encoded_read(&kiocb, &data->iter, &data->args, &cached_state, &disk_bytenr, &disk_io_size); + if (ret == -EAGAIN) + goto out_acct; if (ret < 0 && ret != -EIOCBQUEUED) goto out_free; file_accessed(file); - if (copy_to_user(sqe_addr + copy_end, (const char *)&args + copy_end_kernel, - sizeof(args) - copy_end_kernel)) { + if (copy_to_user(sqe_addr + copy_end, + (const char *)&data->args + copy_end_kernel, + sizeof(data->args) - copy_end_kernel)) { if (ret == -EIOCBQUEUED) { - unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_unlock_extent(io_tree, start, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); } ret = -EFAULT; @@ -4967,46 +4903,160 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue } if (ret == -EIOCBQUEUED) { - u64 count; + u64 count = min_t(u64, iov_iter_count(&data->iter), disk_io_size); - /* - * If we've optimized things by storing the iovecs on the stack, - * undo this. - */ - if (!iov) { - iov = kmalloc(sizeof(struct iovec) * args.iovcnt, GFP_NOFS); - if (!iov) { - unlock_extent(io_tree, start, lockend, &cached_state); - btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); - ret = -ENOMEM; + /* Match ioctl by not returning past EOF if uncompressed. */ + if (!data->args.compression) + count = min_t(u64, count, data->args.len); + + ret = btrfs_uring_read_extent(&kiocb, &data->iter, start, lockend, + cached_state, disk_bytenr, disk_io_size, + count, data->args.compression, + data->iov, cmd); + + goto out_acct; + } + +out_free: + kfree(data->iov); + +out_acct: + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + + if (ret != -EIOCBQUEUED && ret != -EAGAIN) + kfree(data); + + return ret; +} + +static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + loff_t pos; + struct kiocb kiocb; + struct file *file; + ssize_t ret; + void __user *sqe_addr; + struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); + struct btrfs_uring_encoded_data *data = NULL; + + if (cmd->flags & IORING_URING_CMD_REISSUE) + data = bc->data; + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_acct; + } + + file = cmd->file; + sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); + + if (!(file->f_mode & FMODE_WRITE)) { + ret = -EBADF; + goto out_acct; + } + + if (!data) { + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) { + ret = -ENOMEM; + goto out_acct; + } + + bc->data = data; + + if (issue_flags & IO_URING_F_COMPAT) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; + + if (copy_from_user(&args32, sqe_addr, sizeof(args32))) { + ret = -EFAULT; + goto out_acct; + } + data->args.iov = compat_ptr(args32.iov); + data->args.iovcnt = args32.iovcnt; + data->args.offset = args32.offset; + data->args.flags = args32.flags; + data->args.len = args32.len; + data->args.unencoded_len = args32.unencoded_len; + data->args.unencoded_offset = args32.unencoded_offset; + data->args.compression = args32.compression; + data->args.encryption = args32.encryption; + memcpy(data->args.reserved, args32.reserved, + sizeof(data->args.reserved)); +#else + ret = -ENOTTY; + goto out_acct; +#endif + } else { + if (copy_from_user(&data->args, sqe_addr, sizeof(data->args))) { + ret = -EFAULT; goto out_acct; } - - memcpy(iov, iovstack, sizeof(struct iovec) * args.iovcnt); } - count = min_t(u64, iov_iter_count(&iter), disk_io_size); + ret = -EINVAL; + if (data->args.flags != 0) + goto out_acct; + if (memchr_inv(data->args.reserved, 0, sizeof(data->args.reserved))) + goto out_acct; + if (data->args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE && + data->args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE) + goto out_acct; + if (data->args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES || + data->args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES) + goto out_acct; + if (data->args.unencoded_offset > data->args.unencoded_len) + goto out_acct; + if (data->args.len > data->args.unencoded_len - data->args.unencoded_offset) + goto out_acct; - /* Match ioctl by not returning past EOF if uncompressed. */ - if (!args.compression) - count = min_t(u64, count, args.len); + data->iov = data->iovstack; + ret = import_iovec(ITER_SOURCE, data->args.iov, data->args.iovcnt, + ARRAY_SIZE(data->iovstack), &data->iov, + &data->iter); + if (ret < 0) + goto out_acct; - ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend, - cached_state, disk_bytenr, - disk_io_size, count, - args.compression, iov, cmd); + if (iov_iter_count(&data->iter) == 0) { + ret = 0; + goto out_iov; + } + } + if (issue_flags & IO_URING_F_NONBLOCK) { + ret = -EAGAIN; goto out_acct; } -out_free: - kfree(iov); + pos = data->args.offset; + ret = rw_verify_area(WRITE, file, &pos, data->args.len); + if (ret < 0) + goto out_iov; + init_sync_kiocb(&kiocb, file); + ret = kiocb_set_rw_flags(&kiocb, 0, WRITE); + if (ret) + goto out_iov; + kiocb.ki_pos = pos; + + file_start_write(file); + + ret = btrfs_do_write_iter(&kiocb, &data->iter, &data->args); + if (ret > 0) + fsnotify_modify(file); + + file_end_write(file); +out_iov: + kfree(data->iov); out_acct: if (ret > 0) - add_rchar(current, ret); - inc_syscr(current); + add_wchar(current, ret); + inc_syscw(current); + if (ret != -EAGAIN) + kfree(data); return ret; } @@ -5018,6 +5068,12 @@ int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) case BTRFS_IOC_ENCODED_READ_32: #endif return btrfs_uring_encoded_read(cmd, issue_flags); + + case BTRFS_IOC_ENCODED_WRITE: +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + case BTRFS_IOC_ENCODED_WRITE_32: +#endif + return btrfs_uring_encoded_write(cmd, issue_flags); } return -EINVAL; @@ -5179,7 +5235,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SNAP_DESTROY_V2: return btrfs_ioctl_snap_destroy(file, argp, true); case BTRFS_IOC_SUBVOL_GETFLAGS: - return btrfs_ioctl_subvol_getflags(inode, argp); + return btrfs_ioctl_subvol_getflags(BTRFS_I(inode), argp); case BTRFS_IOC_SUBVOL_SETFLAGS: return btrfs_ioctl_subvol_setflags(file, argp); case BTRFS_IOC_DEFAULT_SUBVOL: @@ -5201,9 +5257,9 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_DEV_INFO: return btrfs_ioctl_dev_info(fs_info, argp); case BTRFS_IOC_TREE_SEARCH: - return btrfs_ioctl_tree_search(inode, argp); + return btrfs_ioctl_tree_search(root, argp); case BTRFS_IOC_TREE_SEARCH_V2: - return btrfs_ioctl_tree_search_v2(inode, argp); + return btrfs_ioctl_tree_search_v2(root, argp); case BTRFS_IOC_INO_LOOKUP: return btrfs_ioctl_ino_lookup(root, argp); case BTRFS_IOC_INO_PATHS: @@ -5251,10 +5307,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_set_received_subvol_32(file, argp); #endif case BTRFS_IOC_SEND: - return _btrfs_ioctl_send(BTRFS_I(inode), argp, false); + return _btrfs_ioctl_send(root, argp, false); #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) case BTRFS_IOC_SEND_32: - return _btrfs_ioctl_send(BTRFS_I(inode), argp, true); + return _btrfs_ioctl_send(root, argp, true); #endif case BTRFS_IOC_GET_DEV_STATS: return btrfs_ioctl_get_dev_stats(fs_info, argp); @@ -5290,6 +5346,8 @@ long btrfs_ioctl(struct file *file, unsigned int return fsverity_ioctl_enable(file, (const void __user *)argp); case FS_IOC_MEASURE_VERITY: return fsverity_ioctl_measure(file, argp); + case FS_IOC_READ_VERITY_METADATA: + return fsverity_ioctl_read_metadata(file, argp); case BTRFS_IOC_ENCODED_READ: return btrfs_ioctl_encoded_read(file, argp, false); case BTRFS_IOC_ENCODED_WRITE: diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 2b760c8778f8..ccf6bed9cc24 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -8,18 +8,19 @@ struct file; struct dentry; struct mnt_idmap; -struct fileattr; +struct file_kattr; +struct io_uring_cmd; +struct btrfs_inode; struct btrfs_fs_info; struct btrfs_ioctl_balance_args; long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int btrfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa); int btrfs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); int btrfs_ioctl_get_supported_features(void __user *arg); -void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); -int __pure btrfs_is_empty_uuid(const u8 *uuid); +void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 9a7a7b723305..a3e6d9616e60 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -9,7 +9,6 @@ #include <linux/page-flags.h> #include <asm/bug.h> #include <trace/events/btrfs.h> -#include "misc.h" #include "ctree.h" #include "extent_io.h" #include "locking.h" @@ -150,15 +149,15 @@ void btrfs_tree_read_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesti /* * Try-lock for read. * - * Return 1 if the rwlock has been taken, 0 otherwise + * Return true if the rwlock has been taken, false otherwise */ -int btrfs_try_tree_read_lock(struct extent_buffer *eb) +bool btrfs_try_tree_read_lock(struct extent_buffer *eb) { if (down_read_trylock(&eb->lock)) { trace_btrfs_try_tree_read_lock(eb); - return 1; + return true; } - return 0; + return false; } /* diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 46c8be2afab1..af29df98ac14 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -129,6 +129,16 @@ enum btrfs_lockdep_trans_states { rwsem_release(&owner->lock##_map, _THIS_IP_) /* + * Used to account for the fact that when doing io_uring encoded I/O, we can + * return to userspace with the inode lock still held. + */ +#define btrfs_lockdep_inode_acquire(owner, lock) \ + rwsem_acquire_read(&owner->vfs_inode.lock.dep_map, 0, 0, _THIS_IP_) + +#define btrfs_lockdep_inode_release(owner, lock) \ + rwsem_release(&owner->vfs_inode.lock.dep_map, _THIS_IP_) + +/* * Macros for the transaction states wait events, similar to the generic wait * event macros. */ @@ -179,7 +189,7 @@ static inline void btrfs_tree_read_lock(struct extent_buffer *eb) } void btrfs_tree_read_unlock(struct extent_buffer *eb); -int btrfs_try_tree_read_lock(struct extent_buffer *eb); +bool btrfs_try_tree_read_lock(struct extent_buffer *eb); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root); @@ -189,8 +199,13 @@ static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { lockdep_assert_held_write(&eb->lock); } +static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb) +{ + lockdep_assert_held_read(&eb->lock); +} #else static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { } +static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { } #endif void btrfs_unlock_up_safe(struct btrfs_path *path, int level); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index a45bc11f8665..d403641889ca 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -252,9 +252,8 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, /* Compress at most one sector of data each time */ in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off); ASSERT(in_len); - data_in = kmap_local_folio(folio_in, 0); - ret = lzo1x_1_compress(data_in + - offset_in_page(cur_in), in_len, + data_in = kmap_local_folio(folio_in, offset_in_folio(folio_in, cur_in)); + ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, &out_len, workspace->mem); kunmap_local(data_in); diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 08a9272399d2..022ebc89af85 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -4,6 +4,7 @@ #define BTRFS_MESSAGES_H #include <linux/types.h> +#include <linux/types.h> #include <linux/printk.h> #include <linux/bug.h> @@ -36,106 +37,46 @@ void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); btrfs_no_printk(fs_info, fmt, ##args) #endif -#define btrfs_emerg(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_INFO fmt, ##args) - /* - * Wrappers that use printk_in_rcu + * Print a message with filesystem info, enclosed in RCU protection. */ -#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit_in_rcu(fs_info, fmt, args...) \ +#define btrfs_crit(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err_in_rcu(fs_info, fmt, args...) \ +#define btrfs_err(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn_in_rcu(fs_info, fmt, args...) \ +#define btrfs_warn(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info_in_rcu(fs_info, fmt, args...) \ +#define btrfs_info(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args) /* - * Wrappers that use a ratelimited printk_in_rcu - */ -#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) - -/* * Wrappers that use a ratelimited printk */ -#define btrfs_emerg_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args) #define btrfs_crit_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args) + btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) #define btrfs_err_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args) + btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) #define btrfs_warn_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args) + btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) #define btrfs_info_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args) + btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) #if defined(CONFIG_DYNAMIC_DEBUG) #define btrfs_debug(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk, \ - fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \ fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \ - fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_rl(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \ + _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \ fs_info, KERN_DEBUG fmt, ##args) #elif defined(DEBUG) #define btrfs_debug(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args) + btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) #else -#define btrfs_debug(fs_info, fmt, args...) \ - btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ - btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl(fs_info, fmt, args...) \ - btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) +/* When printk() is no_printk(), expand to no-op. */ +#define btrfs_debug(fs_info, fmt, args...) do { (void)(fs_info); } while(0) +#define btrfs_debug_rl(fs_info, fmt, args...) do { (void)(fs_info); } while(0) #endif #define btrfs_printk_in_rcu(fs_info, fmt, args...) \ @@ -145,40 +86,97 @@ do { \ rcu_read_unlock(); \ } while (0) -#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \ -do { \ - rcu_read_lock(); \ - btrfs_no_printk(fs_info, fmt, ##args); \ - rcu_read_unlock(); \ -} while (0) - -#define btrfs_printk_ratelimited(fs_info, fmt, args...) \ +#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ + \ + rcu_read_lock(); \ if (__ratelimit(&_rs)) \ btrfs_printk(fs_info, fmt, ##args); \ -} while (0) - -#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ -do { \ - rcu_read_lock(); \ - btrfs_printk_ratelimited(fs_info, fmt, ##args); \ rcu_read_unlock(); \ } while (0) #ifdef CONFIG_BTRFS_ASSERT -#define btrfs_assertfail(expr, file, line) ({ \ - pr_err("assertion failed: %s, in %s:%d\n", (expr), (file), (line)); \ - BUG(); \ -}) +__printf(1, 2) +static inline void verify_assert_printk_format(const char *fmt, ...) { + /* Stub to verify the assertion format string. */ +} + +/* Take the first token if any. */ +#define __FIRST_ARG(_, ...) _ +/* + * Skip the first token and return the rest, if it's empty the comma is dropped. + * As ##__VA_ARGS__ cannot be at the beginning of the macro the __VA_OPT__ is needed + * and supported since GCC 8 and Clang 12. + */ +#define __REST_ARGS(_, ... ) __VA_OPT__(,) __VA_ARGS__ + +#if defined(CONFIG_CC_IS_CLANG) || GCC_VERSION >= 80000 +/* + * Assertion with optional printk() format. + * + * Accepted syntax: + * ASSERT(condition); + * ASSERT(condition, "string"); + * ASSERT(condition, "variable=%d", variable); + * + * How it works: + * - if there's no format string, ""[0] evaluates at compile time to 0 and the + * true branch is executed + * - any non-empty format string with the "" prefix evaluates to != 0 at + * compile time and the false branch is executed + * - stringified condition is printed as %s so we don't accidentally mix format + * strings (the % operator) + * - there can be only one printk() call, so the format strings and arguments are + * spliced together: + * DEFAULT_FMT [USER_FMT], DEFAULT_ARGS [, USER_ARGS] + * - comma between DEFAULT_ARGS and USER_ARGS is handled by preprocessor + * (requires __VA_OPT__ support) + * - otherwise we could use __VA_OPT(,) __VA_ARGS__ for the 2nd+ argument of args, + */ +#define ASSERT(cond, args...) \ +do { \ + verify_assert_printk_format("check the format string" args); \ + if (!likely(cond)) { \ + if (("" __FIRST_ARG(args) [0]) == 0) { \ + pr_err("assertion failed: %s :: %ld, in %s:%d\n", \ + #cond, (long)(cond), __FILE__, __LINE__); \ + } else { \ + pr_err("assertion failed: %s :: %ld, in %s:%d (" __FIRST_ARG(args) ")\n", \ + #cond, (long)(cond), __FILE__, __LINE__ __REST_ARGS(args)); \ + } \ + BUG(); \ + } \ +} while(0) + +#else + +/* For GCC < 8.x only the simple output. */ + +#define ASSERT(cond, args...) \ +do { \ + verify_assert_printk_format("check the format string" args); \ + if (!likely(cond)) { \ + pr_err("assertion failed: %s :: %ld, in %s:%d\n", \ + #cond, (long)(cond), __FILE__, __LINE__); \ + BUG(); \ + } \ +} while(0) + +#endif + +#else +#define ASSERT(cond, args...) (void)(cond) +#endif -#define ASSERT(expr) \ - (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__)) +#ifdef CONFIG_BTRFS_DEBUG +/* Verbose warning only under debug build. */ +#define DEBUG_WARN(args...) WARN(1, KERN_ERR args) #else -#define ASSERT(expr) (void)(expr) +#define DEBUG_WARN(...) do {} while(0) #endif __printf(5, 6) diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 0d599fd847c9..ff5eac84d819 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -7,6 +7,8 @@ #include <linux/bitmap.h> #include <linux/sched.h> #include <linux/wait.h> +#include <linux/mm.h> +#include <linux/pagemap.h> #include <linux/math64.h> #include <linux/rbtree.h> @@ -119,28 +121,23 @@ static inline struct rb_node *rb_simple_search_first(const struct rb_root *root, return ret; } -static inline struct rb_node *rb_simple_insert(struct rb_root *root, u64 bytenr, - struct rb_node *node) +static int rb_simple_node_bytenr_cmp(struct rb_node *new, const struct rb_node *existing) { - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct rb_simple_node *entry; + struct rb_simple_node *new_entry = rb_entry(new, struct rb_simple_node, rb_node); + struct rb_simple_node *existing_entry = rb_entry(existing, struct rb_simple_node, rb_node); - while (*p) { - parent = *p; - entry = rb_entry(parent, struct rb_simple_node, rb_node); + if (new_entry->bytenr < existing_entry->bytenr) + return -1; + else if (new_entry->bytenr > existing_entry->bytenr) + return 1; - if (bytenr < entry->bytenr) - p = &(*p)->rb_left; - else if (bytenr > entry->bytenr) - p = &(*p)->rb_right; - else - return parent; - } + return 0; +} - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; +static inline struct rb_node *rb_simple_insert(struct rb_root *root, + struct rb_simple_node *simple_node) +{ + return rb_find_add(&simple_node->rb_node, root, rb_simple_node_bytenr_cmp); } static inline bool bitmap_test_range_all_set(const unsigned long *addr, @@ -163,4 +160,9 @@ static inline bool bitmap_test_range_all_zero(const unsigned long *addr, return (found_set == start + nbits); } +static inline u64 folio_end(struct folio *folio) +{ + return folio_pos(folio) + folio_size(folio); +} + #endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 30eceaf829a7..2829f20d7bb5 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -153,25 +153,30 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( struct btrfs_ordered_extent *entry; int ret; u64 qgroup_rsv = 0; + const bool is_nocow = (flags & + ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC))); - if (flags & - ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) { - /* For nocow write, we can release the qgroup rsv right now */ + /* + * For a NOCOW write we can free the qgroup reserve right now. For a COW + * one we transfer the reserved space from the inode's iotree into the + * ordered extent by calling btrfs_qgroup_release_data() and tracking + * the qgroup reserved amount in the ordered extent, so that later after + * completing the ordered extent, when running the data delayed ref it + * creates, we free the reserved data with btrfs_qgroup_free_refroot(). + */ + if (is_nocow) ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv); - if (ret < 0) - return ERR_PTR(ret); - } else { - /* - * The ordered extent has reserved qgroup space, release now - * and pass the reserved number for qgroup_record to free. - */ + else ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv); - if (ret < 0) - return ERR_PTR(ret); - } + + if (ret < 0) + return ERR_PTR(ret); + entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); - if (!entry) - return ERR_PTR(-ENOMEM); + if (!entry) { + entry = ERR_PTR(-ENOMEM); + goto out; + } entry->file_offset = file_offset; entry->num_bytes = num_bytes; @@ -180,7 +185,12 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( entry->disk_num_bytes = disk_num_bytes; entry->offset = offset; entry->bytes_left = num_bytes; - entry->inode = BTRFS_I(igrab(&inode->vfs_inode)); + if (WARN_ON_ONCE(!igrab(&inode->vfs_inode))) { + kmem_cache_free(btrfs_ordered_extent_cache, entry); + entry = ERR_PTR(-ESTALE); + goto out; + } + entry->inode = inode; entry->compress_type = compress_type; entry->truncated_len = (u64)-1; entry->qgroup_rsv = qgroup_rsv; @@ -203,6 +213,12 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( btrfs_mod_outstanding_extents(inode, 1); spin_unlock(&inode->lock); +out: + if (IS_ERR(entry) && !is_nocow) + btrfs_qgroup_free_refroot(inode->root->fs_info, + btrfs_root_id(inode->root), + qgroup_rsv, BTRFS_QGROUP_RSV_DATA); + return entry; } @@ -253,7 +269,7 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry) * @disk_bytenr: Offset of extent on disk. * @disk_num_bytes: Size of extent on disk. * @offset: Offset into unencoded data where file data starts. - * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*). + * @flags: Flags specifying type of extent (1U << BTRFS_ORDERED_*). * @compress_type: Compression algorithm used for data. * * Most of these parameters correspond to &struct btrfs_file_extent_item. The @@ -343,7 +359,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, if (folio) { ASSERT(folio->mapping); ASSERT(folio_pos(folio) <= file_offset); - ASSERT(file_offset + len <= folio_pos(folio) + folio_size(folio)); + ASSERT(file_offset + len <= folio_end(folio)); /* * Ordered flag indicates whether we still have @@ -607,23 +623,18 @@ out: */ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) { - struct list_head *cur; - struct btrfs_ordered_sum *sum; - trace_btrfs_ordered_extent_put(entry->inode, entry); if (refcount_dec_and_test(&entry->refs)) { + struct btrfs_ordered_sum *sum; + struct btrfs_ordered_sum *tmp; + ASSERT(list_empty(&entry->root_extent_list)); ASSERT(list_empty(&entry->log_list)); ASSERT(RB_EMPTY_NODE(&entry->rb_node)); - if (entry->inode) - btrfs_add_delayed_iput(entry->inode); - while (!list_empty(&entry->list)) { - cur = entry->list.next; - sum = list_entry(cur, struct btrfs_ordered_sum, list); - list_del(&sum->list); + btrfs_add_delayed_iput(entry->inode); + list_for_each_entry_safe(sum, tmp, &entry->list, list) kvfree(sum); - } kmem_cache_free(btrfs_ordered_extent_cache, entry); } } @@ -842,10 +853,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, /* * Start IO and wait for a given ordered extent to finish. * - * Wait on page writeback for all the pages in the extent and the IO completion - * code to insert metadata into the btree corresponding to the extent. + * Wait on page writeback for all the pages in the extent but not in + * [@nowriteback_start, @nowriteback_start + @nowriteback_len) and the + * IO completion code to insert metadata into the btree corresponding to the extent. */ -void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) +void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry, + u64 nowriteback_start, u32 nowriteback_len) { u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; @@ -865,8 +878,19 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) * start IO on any dirty ones so the wait doesn't stall waiting * for the flusher thread to find them */ - if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) - filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); + if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) { + if (!nowriteback_len) { + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); + } else { + if (start < nowriteback_start) + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, + nowriteback_start - 1); + if (nowriteback_start + nowriteback_len < end) + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, + nowriteback_start + nowriteback_len, + end); + } + } if (!freespace_inode) btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); @@ -1160,7 +1184,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, cachedp = cached_state; while (1) { - lock_extent(&inode->io_tree, start, end, cachedp); + btrfs_lock_extent(&inode->io_tree, start, end, cachedp); ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); if (!ordered) { @@ -1173,7 +1197,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, refcount_dec(&cache->refs); break; } - unlock_extent(&inode->io_tree, start, end, cachedp); + btrfs_unlock_extent(&inode->io_tree, start, end, cachedp); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); } @@ -1191,7 +1215,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, { struct btrfs_ordered_extent *ordered; - if (!try_lock_extent(&inode->io_tree, start, end, cached_state)) + if (!btrfs_try_lock_extent(&inode->io_tree, start, end, cached_state)) return false; ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); @@ -1199,7 +1223,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, return true; btrfs_put_ordered_extent(ordered); - unlock_extent(&inode->io_tree, start, end, cached_state); + btrfs_unlock_extent(&inode->io_tree, start, end, cached_state); return false; } @@ -1229,6 +1253,18 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( */ if (WARN_ON_ONCE(len >= ordered->num_bytes)) return ERR_PTR(-EINVAL); + /* + * If our ordered extent had an error there's no point in continuing. + * The error may have come from a transaction abort done either by this + * task or some other concurrent task, and the transaction abort path + * iterates over all existing ordered extents and sets the flag + * BTRFS_ORDERED_IOERR on them. + */ + if (unlikely(flags & (1U << BTRFS_ORDERED_IOERR))) { + const int fs_error = BTRFS_FS_ERROR(fs_info); + + return fs_error ? ERR_PTR(fs_error) : ERR_PTR(-EIO); + } /* We cannot split partially completed ordered extents. */ if (ordered->bytes_left) { ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS)); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 4e152736d06c..1e6b0b182b29 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -17,6 +17,7 @@ struct inode; struct page; struct extent_state; +struct btrfs_block_group; struct btrfs_inode; struct btrfs_root; struct btrfs_fs_info; @@ -191,7 +192,13 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset); -void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry); +void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry, + u64 nowriteback_start, u32 nowriteback_len); +static inline void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) +{ + return btrfs_start_ordered_extent_nowriteback(entry, 0, 0); +} + int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index fc821aa446f0..74e38da9bd39 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -190,7 +190,7 @@ static void print_uuid_item(const struct extent_buffer *l, unsigned long offset, u32 item_size) { if (!IS_ALIGNED(item_size, sizeof(u64))) { - pr_warn("BTRFS: uuid item with illegal size %lu!\n", + btrfs_warn(l->fs_info, "uuid item with illegal size %lu", (unsigned long)item_size); return; } @@ -223,7 +223,7 @@ static void print_eb_refs_lock(const struct extent_buffer *eb) { #ifdef CONFIG_BTRFS_DEBUG btrfs_info(eb->fs_info, "refs %u lock_owner %u current %u", - atomic_read(&eb->refs), eb->lock_owner, current->pid); + refcount_read(&eb->refs), eb->lock_owner, current->pid); #endif } diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index 8504bf1702c7..d0e620bf5f5a 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -6,6 +6,8 @@ #ifndef BTRFS_PRINT_TREE_H #define BTRFS_PRINT_TREE_H +#include <linux/types.h> + /* Buffer size to contain tree name and possibly additional data (offset) */ #define BTRFS_ROOT_NAME_BUF_LEN 48 diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index b8fa34e16abb..adc956432d2f 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -26,8 +26,8 @@ struct prop_handler { const char *xattr_name; int (*validate)(const struct btrfs_inode *inode, const char *value, size_t len); - int (*apply)(struct inode *inode, const char *value, size_t len); - const char *(*extract)(const struct inode *inode); + int (*apply)(struct btrfs_inode *inode, const char *value, size_t len); + const char *(*extract)(const struct btrfs_inode *inode); bool (*ignore)(const struct btrfs_inode *inode); int inheritable; }; @@ -121,7 +121,7 @@ int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, if (ret) return ret; - ret = handler->apply(&inode->vfs_inode, NULL, 0); + ret = handler->apply(inode, NULL, 0); ASSERT(ret == 0); return ret; @@ -131,7 +131,7 @@ int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, value_len, flags); if (ret) return ret; - ret = handler->apply(&inode->vfs_inode, value, value_len); + ret = handler->apply(inode, value, value_len); if (ret) { btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name, NULL, 0, flags); @@ -263,7 +263,7 @@ static void inode_prop_iterator(void *ctx, struct btrfs_root *root = BTRFS_I(inode)->root; int ret; - ret = handler->apply(inode, value, len); + ret = handler->apply(BTRFS_I(inode), value, len); if (unlikely(ret)) btrfs_warn(root->fs_info, "error applying prop %s to ino %llu (root %llu): %d", @@ -273,12 +273,13 @@ static void inode_prop_iterator(void *ctx, set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); } -int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path) +int btrfs_load_inode_props(struct btrfs_inode *inode, struct btrfs_path *path) { - struct btrfs_root *root = BTRFS_I(inode)->root; - u64 ino = btrfs_ino(BTRFS_I(inode)); + struct btrfs_root *root = inode->root; + u64 ino = btrfs_ino(inode); - return iterate_object_props(root, path, ino, inode_prop_iterator, inode); + return iterate_object_props(root, path, ino, inode_prop_iterator, + &inode->vfs_inode); } static int prop_compression_validate(const struct btrfs_inode *inode, @@ -300,26 +301,26 @@ static int prop_compression_validate(const struct btrfs_inode *inode, return -EINVAL; } -static int prop_compression_apply(struct inode *inode, const char *value, +static int prop_compression_apply(struct btrfs_inode *inode, const char *value, size_t len) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; int type; /* Reset to defaults */ if (len == 0) { - BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; + inode->flags &= ~BTRFS_INODE_COMPRESS; + inode->flags &= ~BTRFS_INODE_NOCOMPRESS; + inode->prop_compress = BTRFS_COMPRESS_NONE; return 0; } /* Set NOCOMPRESS flag */ if ((len == 2 && strncmp("no", value, 2) == 0) || (len == 4 && strncmp("none", value, 4) == 0)) { - BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; + inode->flags |= BTRFS_INODE_NOCOMPRESS; + inode->flags &= ~BTRFS_INODE_COMPRESS; + inode->prop_compress = BTRFS_COMPRESS_NONE; return 0; } @@ -336,9 +337,9 @@ static int prop_compression_apply(struct inode *inode, const char *value, return -EINVAL; } - BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->prop_compress = type; + inode->flags &= ~BTRFS_INODE_NOCOMPRESS; + inode->flags |= BTRFS_INODE_COMPRESS; + inode->prop_compress = type; return 0; } @@ -359,13 +360,13 @@ static bool prop_compression_ignore(const struct btrfs_inode *inode) return false; } -static const char *prop_compression_extract(const struct inode *inode) +static const char *prop_compression_extract(const struct btrfs_inode *inode) { - switch (BTRFS_I(inode)->prop_compress) { + switch (inode->prop_compress) { case BTRFS_COMPRESS_ZLIB: case BTRFS_COMPRESS_LZO: case BTRFS_COMPRESS_ZSTD: - return btrfs_compress_type2str(BTRFS_I(inode)->prop_compress); + return btrfs_compress_type2str(inode->prop_compress); default: break; } @@ -385,16 +386,16 @@ static struct prop_handler prop_handlers[] = { }; int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, - struct inode *inode, const struct inode *parent) + struct btrfs_inode *inode, + const struct btrfs_inode *parent) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; int ret; int i; bool need_reserve = false; - if (!test_bit(BTRFS_INODE_HAS_PROPS, - &BTRFS_I(parent)->runtime_flags)) + if (!test_bit(BTRFS_INODE_HAS_PROPS, &parent->runtime_flags)) return 0; for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { @@ -405,7 +406,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, if (!h->inheritable) continue; - if (h->ignore(BTRFS_I(inode))) + if (h->ignore(inode)) continue; value = h->extract(parent); @@ -416,7 +417,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, * This is not strictly necessary as the property should be * valid, but in case it isn't, don't propagate it further. */ - ret = h->validate(BTRFS_I(inode), value, strlen(value)); + ret = h->validate(inode, value, strlen(value)); if (ret) continue; @@ -436,16 +437,15 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, return ret; } - ret = btrfs_setxattr(trans, inode, h->xattr_name, value, + ret = btrfs_setxattr(trans, &inode->vfs_inode, h->xattr_name, value, strlen(value), 0); if (!ret) { ret = h->apply(inode, value, strlen(value)); if (ret) - btrfs_setxattr(trans, inode, h->xattr_name, + btrfs_setxattr(trans, &inode->vfs_inode, h->xattr_name, NULL, 0, 0); else - set_bit(BTRFS_INODE_HAS_PROPS, - &BTRFS_I(inode)->runtime_flags); + set_bit(BTRFS_INODE_HAS_PROPS, &inode->runtime_flags); } if (need_reserve) { diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index 63546d0a9444..15d9a025c923 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -6,9 +6,9 @@ #ifndef BTRFS_PROPS_H #define BTRFS_PROPS_H +#include <linux/types.h> #include <linux/compiler_types.h> -struct inode; struct btrfs_inode; struct btrfs_path; struct btrfs_trans_handle; @@ -22,10 +22,10 @@ int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name, const char *value, size_t value_len); bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name); -int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path); +int btrfs_load_inode_props(struct btrfs_inode *inode, struct btrfs_path *path); int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, - struct inode *inode, - const struct inode *dir); + struct btrfs_inode *inode, + const struct btrfs_inode *dir); #endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index a6f92836c9b1..1a5972178b3a 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -83,7 +83,7 @@ static void qgroup_rsv_add(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup, u64 num_bytes, enum btrfs_qgroup_rsv_type type) { - trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type); + trace_btrfs_qgroup_update_reserve(fs_info, qgroup, num_bytes, type); qgroup->rsv.values[type] += num_bytes; } @@ -91,7 +91,7 @@ static void qgroup_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup, u64 num_bytes, enum btrfs_qgroup_rsv_type type) { - trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type); + trace_btrfs_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type); if (qgroup->rsv.values[type] >= num_bytes) { qgroup->rsv.values[type] -= num_bytes; return; @@ -160,23 +160,34 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, int init_flags); static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info); +static int btrfs_qgroup_qgroupid_key_cmp(const void *key, const struct rb_node *node) +{ + const u64 *qgroupid = key; + const struct btrfs_qgroup *qgroup = rb_entry(node, struct btrfs_qgroup, node); + + if (qgroup->qgroupid < *qgroupid) + return -1; + else if (qgroup->qgroupid > *qgroupid) + return 1; + + return 0; +} + /* must be called with qgroup_ioctl_lock held */ static struct btrfs_qgroup *find_qgroup_rb(const struct btrfs_fs_info *fs_info, u64 qgroupid) { - struct rb_node *n = fs_info->qgroup_tree.rb_node; - struct btrfs_qgroup *qgroup; + struct rb_node *node; - while (n) { - qgroup = rb_entry(n, struct btrfs_qgroup, node); - if (qgroup->qgroupid < qgroupid) - n = n->rb_left; - else if (qgroup->qgroupid > qgroupid) - n = n->rb_right; - else - return qgroup; - } - return NULL; + node = rb_find(&qgroupid, &fs_info->qgroup_tree, btrfs_qgroup_qgroupid_key_cmp); + return rb_entry_safe(node, struct btrfs_qgroup, node); +} + +static int btrfs_qgroup_qgroupid_cmp(struct rb_node *new, const struct rb_node *existing) +{ + const struct btrfs_qgroup *new_qgroup = rb_entry(new, struct btrfs_qgroup, node); + + return btrfs_qgroup_qgroupid_key_cmp(&new_qgroup->qgroupid, existing); } /* @@ -191,39 +202,25 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *prealloc, u64 qgroupid) { - struct rb_node **p = &fs_info->qgroup_tree.rb_node; - struct rb_node *parent = NULL; - struct btrfs_qgroup *qgroup; + struct rb_node *node; /* Caller must have pre-allocated @prealloc. */ ASSERT(prealloc); - while (*p) { - parent = *p; - qgroup = rb_entry(parent, struct btrfs_qgroup, node); - - if (qgroup->qgroupid < qgroupid) { - p = &(*p)->rb_left; - } else if (qgroup->qgroupid > qgroupid) { - p = &(*p)->rb_right; - } else { - kfree(prealloc); - return qgroup; - } + prealloc->qgroupid = qgroupid; + node = rb_find_add(&prealloc->node, &fs_info->qgroup_tree, btrfs_qgroup_qgroupid_cmp); + if (node) { + kfree(prealloc); + return rb_entry(node, struct btrfs_qgroup, node); } - qgroup = prealloc; - qgroup->qgroupid = qgroupid; - INIT_LIST_HEAD(&qgroup->groups); - INIT_LIST_HEAD(&qgroup->members); - INIT_LIST_HEAD(&qgroup->dirty); - INIT_LIST_HEAD(&qgroup->iterator); - INIT_LIST_HEAD(&qgroup->nested_iterator); - - rb_link_node(&qgroup->node, parent, p); - rb_insert_color(&qgroup->node, &fs_info->qgroup_tree); + INIT_LIST_HEAD(&prealloc->groups); + INIT_LIST_HEAD(&prealloc->members); + INIT_LIST_HEAD(&prealloc->dirty); + INIT_LIST_HEAD(&prealloc->iterator); + INIT_LIST_HEAD(&prealloc->nested_iterator); - return qgroup; + return prealloc; } static void __del_qgroup_rb(struct btrfs_qgroup *qgroup) @@ -349,13 +346,27 @@ int btrfs_verify_qgroup_counts(const struct btrfs_fs_info *fs_info, u64 qgroupid } #endif -static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info) +__printf(2, 3) +static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info, const char *fmt, ...) { + const u64 old_flags = fs_info->qgroup_flags; + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) return; fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT | BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); + if (!(old_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) { + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_warn_rl(fs_info, "qgroup marked inconsistent, %pV", &vaf); + va_end(args); + } } static void qgroup_read_enable_gen(struct btrfs_fs_info *fs_info, @@ -386,12 +397,6 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) if (!fs_info->quota_root) return 0; - fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); - if (!fs_info->qgroup_ulist) { - ret = -ENOMEM; - goto out; - } - path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; @@ -434,13 +439,10 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) goto out; } fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr); - if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) { + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) qgroup_read_enable_gen(fs_info, l, slot, ptr); - } else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) { - qgroup_mark_inconsistent(fs_info); - btrfs_err(fs_info, - "qgroup generation mismatch, marked as inconsistent"); - } + else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) + qgroup_mark_inconsistent(fs_info, "qgroup generation mismatch"); rescan_progress = btrfs_qgroup_status_rescan(l, ptr); goto next1; } @@ -451,10 +453,8 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) qgroup = find_qgroup_rb(fs_info, found_key.offset); if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || - (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { - btrfs_err(fs_info, "inconsistent qgroup config"); - qgroup_mark_inconsistent(fs_info); - } + (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) + qgroup_mark_inconsistent(fs_info, "inconsistent qgroup config"); if (!qgroup) { struct btrfs_qgroup *prealloc; struct btrfs_root *tree_root = fs_info->tree_root; @@ -476,7 +476,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) * during mount before we start doing things like creating * subvolumes. */ - if (is_fstree(qgroup->qgroupid) && + if (btrfs_is_fstree(qgroup->qgroupid) && qgroup->qgroupid > tree_root->free_objectid) /* * Don't need to check against BTRFS_LAST_FREE_OBJECTID, @@ -581,8 +581,6 @@ out: if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) ret = qgroup_rescan_init(fs_info, rescan_progress, 0); } else { - ulist_free(fs_info->qgroup_ulist); - fs_info->qgroup_ulist = NULL; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; btrfs_sysfs_del_qgroups(fs_info); } @@ -630,29 +628,30 @@ bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info) /* * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(), - * first two are in single-threaded paths.And for the third one, we have set - * quota_root to be null with qgroup_lock held before, so it is safe to clean - * up the in-memory structures without qgroup_lock held. + * first two are in single-threaded paths. */ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) { struct rb_node *n; struct btrfs_qgroup *qgroup; + /* + * btrfs_quota_disable() can be called concurrently with + * btrfs_qgroup_rescan() -> qgroup_rescan_zero_tracking(), so take the + * lock. + */ + spin_lock(&fs_info->qgroup_lock); while ((n = rb_first(&fs_info->qgroup_tree))) { qgroup = rb_entry(n, struct btrfs_qgroup, node); rb_erase(n, &fs_info->qgroup_tree); __del_qgroup_rb(qgroup); + spin_unlock(&fs_info->qgroup_lock); btrfs_sysfs_del_one_qgroup(fs_info, qgroup); kfree(qgroup); + spin_lock(&fs_info->qgroup_lock); } - /* - * We call btrfs_free_qgroup_config() when unmounting - * filesystem and disabling quota, so we set qgroup_ulist - * to be null here to avoid double free. - */ - ulist_free(fs_info->qgroup_ulist); - fs_info->qgroup_ulist = NULL; + spin_unlock(&fs_info->qgroup_lock); + btrfs_sysfs_del_qgroups(fs_info); } @@ -673,9 +672,6 @@ static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, key.offset = dst; ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); - - btrfs_mark_buffer_dirty(trans, path->nodes[0]); - btrfs_free_path(path); return ret; } @@ -752,8 +748,6 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0); btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0); - btrfs_mark_buffer_dirty(trans, leaf); - btrfs_release_path(path); key.type = BTRFS_QGROUP_LIMIT_KEY; @@ -771,8 +765,6 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0); btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0); - btrfs_mark_buffer_dirty(trans, leaf); - ret = 0; out: btrfs_free_path(path); @@ -859,9 +851,6 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl); btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer); btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl); - - btrfs_mark_buffer_dirty(trans, l); - out: btrfs_free_path(path); return ret; @@ -905,9 +894,6 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl); btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr); - - btrfs_mark_buffer_dirty(trans, l); - out: btrfs_free_path(path); return ret; @@ -947,9 +933,6 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans) btrfs_set_qgroup_status_generation(l, ptr, trans->transid); btrfs_set_qgroup_status_rescan(l, ptr, fs_info->qgroup_rescan_progress.objectid); - - btrfs_mark_buffer_dirty(trans, l); - out: btrfs_free_path(path); return ret; @@ -972,8 +955,8 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, return -ENOMEM; key.objectid = 0; - key.offset = 0; key.type = 0; + key.offset = 0; while (1) { ret = btrfs_search_slot(trans, root, &key, path, -1, 1); @@ -1014,7 +997,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup = NULL; struct btrfs_qgroup *prealloc = NULL; struct btrfs_trans_handle *trans = NULL; - struct ulist *ulist = NULL; const bool simple = (quota_ctl_args->cmd == BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA); int ret = 0; int slot; @@ -1037,12 +1019,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, if (fs_info->quota_root) goto out; - ulist = ulist_alloc(GFP_KERNEL); - if (!ulist) { - ret = -ENOMEM; - goto out; - } - ret = btrfs_sysfs_add_qgroups(fs_info); if (ret < 0) goto out; @@ -1082,9 +1058,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, if (fs_info->quota_root) goto out; - fs_info->qgroup_ulist = ulist; - ulist = NULL; - /* * initially create the quota tree */ @@ -1121,6 +1094,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON; if (simple) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; + btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid); } else { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; @@ -1129,8 +1103,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, BTRFS_QGROUP_STATUS_FLAGS_MASK); btrfs_set_qgroup_status_rescan(leaf, ptr, 0); - btrfs_mark_buffer_dirty(trans, leaf); - key.objectid = 0; key.type = BTRFS_ROOT_REF_KEY; key.offset = 0; @@ -1172,11 +1144,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); prealloc = NULL; - if (IS_ERR(qgroup)) { - ret = PTR_ERR(qgroup); - btrfs_abort_transaction(trans, ret); - goto out_free_path; - } ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) { btrfs_abort_transaction(trans, ret); @@ -1254,8 +1221,6 @@ out_add_root: spin_lock(&fs_info->qgroup_lock); fs_info->quota_root = quota_root; set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - if (simple) - btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); spin_unlock(&fs_info->qgroup_lock); /* Skip rescan for simple qgroups. */ @@ -1291,17 +1256,13 @@ out_free_root: if (ret) btrfs_put_root(quota_root); out: - if (ret) { - ulist_free(fs_info->qgroup_ulist); - fs_info->qgroup_ulist = NULL; + if (ret) btrfs_sysfs_del_qgroups(fs_info); - } mutex_unlock(&fs_info->qgroup_ioctl_lock); if (ret && trans) btrfs_end_transaction(trans); else if (trans) ret = btrfs_end_transaction(trans); - ulist_free(ulist); kfree(prealloc); return ret; } @@ -1373,11 +1334,14 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) /* * We have nothing held here and no trans handle, just return the error - * if there is one. + * if there is one and set back the quota enabled bit since we didn't + * actually disable quotas. */ ret = flush_reservations(fs_info); - if (ret) + if (ret) { + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); return ret; + } /* * 1 For the root item @@ -1698,9 +1662,6 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) struct btrfs_qgroup *prealloc = NULL; int ret = 0; - if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) - return 0; - mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) { ret = -ENOTCONN; @@ -1839,9 +1800,19 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) * Thus its reserved space should all be zero, no matter if qgroup * is consistent or the mode. */ - WARN_ON(qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] || - qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] || - qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]); + if (qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] || + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] || + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]) { + DEBUG_WARN(); + btrfs_warn_rl(fs_info, +"to be deleted qgroup %u/%llu has non-zero numbers, data %llu meta prealloc %llu meta pertrans %llu", + btrfs_qgroup_level(qgroup->qgroupid), + btrfs_qgroup_subvolid(qgroup->qgroupid), + qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA], + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC], + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]); + + } /* * The same for rfer/excl numbers, but that's only if our qgroup is * consistent and if it's in regular qgroup mode. @@ -1850,15 +1821,15 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) */ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL && !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) { - if (WARN_ON(qgroup->rfer || qgroup->excl || - qgroup->rfer_cmpr || qgroup->excl_cmpr)) { - btrfs_warn_rl(fs_info, -"to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu", - btrfs_qgroup_level(qgroup->qgroupid), - btrfs_qgroup_subvolid(qgroup->qgroupid), - qgroup->rfer, qgroup->rfer_cmpr, - qgroup->excl, qgroup->excl_cmpr); - qgroup_mark_inconsistent(fs_info); + if (qgroup->rfer || qgroup->excl || + qgroup->rfer_cmpr || qgroup->excl_cmpr) { + DEBUG_WARN(); + qgroup_mark_inconsistent(fs_info, + "to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu", + btrfs_qgroup_level(qgroup->qgroupid), + btrfs_qgroup_subvolid(qgroup->qgroupid), + qgroup->rfer, qgroup->rfer_cmpr, + qgroup->excl, qgroup->excl_cmpr); } } del_qgroup_rb(fs_info, qgroupid); @@ -1881,18 +1852,15 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su struct btrfs_trans_handle *trans; int ret; - if (!is_fstree(subvolid) || !btrfs_qgroup_enabled(fs_info) || !fs_info->quota_root) + if (!btrfs_is_fstree(subvolid) || !btrfs_qgroup_enabled(fs_info) || + !fs_info->quota_root) return 0; /* * Commit current transaction to make sure all the rfer/excl numbers * get updated. */ - trans = btrfs_start_transaction(fs_info->quota_root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = btrfs_commit_transaction(trans); + ret = btrfs_commit_current_transaction(fs_info->quota_root); if (ret < 0) return ret; @@ -1905,8 +1873,11 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su /* * It's squota and the subvolume still has numbers needed for future * accounting, in this case we can not delete it. Just skip it. + * + * Or the qgroup is already removed by a qgroup rescan. For both cases we're + * safe to ignore them. */ - if (ret == -EBUSY) + if (ret == -EBUSY || ret == -ENOENT) ret = 0; return ret; } @@ -1977,11 +1948,8 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, spin_unlock(&fs_info->qgroup_lock); ret = update_qgroup_limit_item(trans, qgroup); - if (ret) { - qgroup_mark_inconsistent(fs_info); - btrfs_info(fs_info, "unable to update quota limit for %llu", - qgroupid); - } + if (ret) + qgroup_mark_inconsistent(fs_info, "qgroup item update error %d", ret); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); @@ -2036,7 +2004,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, ret = __xa_store(&delayed_refs->dirty_extents, index, record, GFP_ATOMIC); xa_unlock(&delayed_refs->dirty_extents); if (xa_is_err(ret)) { - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, "xarray insert error: %d", xa_err(ret)); return xa_err(ret); } @@ -2103,10 +2071,8 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, ret = btrfs_find_all_roots(&ctx, true); if (ret < 0) { - qgroup_mark_inconsistent(fs_info); - btrfs_warn(fs_info, -"error accounting new delayed refs extent (err code: %d), quota inconsistent", - ret); + qgroup_mark_inconsistent(fs_info, + "error accounting new delayed refs extent: %d", ret); return 0; } @@ -2350,7 +2316,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0); /* For src_path */ - atomic_inc(&src_eb->refs); + refcount_inc(&src_eb->refs); src_path->nodes[root_level] = src_eb; src_path->slots[root_level] = dst_path->slots[root_level]; src_path->locks[root_level] = 0; @@ -2583,7 +2549,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, goto out; } /* For dst_path */ - atomic_inc(&dst_eb->refs); + refcount_inc(&dst_eb->refs); dst_path->nodes[level] = dst_eb; dst_path->slots[level] = 0; dst_path->locks[level] = 0; @@ -2598,7 +2564,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, out: btrfs_free_path(dst_path); if (ret < 0) - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, "%s error: %d", __func__, ret); return ret; } @@ -2642,7 +2608,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, * mark qgroup inconsistent. */ if (root_level >= drop_subptree_thres) { - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, "subtree level reached threshold"); return 0; } @@ -2675,7 +2641,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, * walk back up the tree (adjusting slot pointers as we go) * and restart the search process. */ - atomic_inc(&root_eb->refs); /* For path */ + refcount_inc(&root_eb->refs); /* For path */ path->nodes[root_level] = root_eb; path->slots[root_level] = 0; path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ @@ -2846,8 +2812,8 @@ static void qgroup_update_counters(struct btrfs_fs_info *fs_info, cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); - trace_qgroup_update_counters(fs_info, qg, cur_old_count, - cur_new_count); + trace_btrfs_qgroup_update_counters(fs_info, qg, cur_old_count, + cur_new_count); /* Rfer update part */ if (cur_old_count == 0 && cur_new_count > 0) { @@ -2941,7 +2907,7 @@ static int maybe_fs_roots(struct ulist *roots) * trees. * If it contains a non-fs tree, it won't be shared with fs/subvol trees. */ - return is_fstree(unode->val); + return btrfs_is_fstree(unode->val); } int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, @@ -3109,8 +3075,7 @@ cleanup: kfree(record); } - trace_qgroup_num_dirty_extents(fs_info, trans->transid, - num_dirty_extents); + trace_btrfs_qgroup_num_dirty_extents(fs_info, trans->transid, num_dirty_extents); return ret; } @@ -3143,10 +3108,12 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) spin_unlock(&fs_info->qgroup_lock); ret = update_qgroup_info_item(trans, qgroup); if (ret) - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, + "qgroup info item update error %d", ret); ret = update_qgroup_limit_item(trans, qgroup); if (ret) - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, + "qgroup limit item update error %d", ret); spin_lock(&fs_info->qgroup_lock); } if (btrfs_qgroup_enabled(fs_info)) @@ -3157,7 +3124,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) ret = update_qgroup_status_item(trans); if (ret) - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, + "qgroup status item update error %d", ret); return ret; } @@ -3339,6 +3307,9 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, u32 level_size = 0; u64 nums; + if (!btrfs_qgroup_enabled(fs_info)) + return 0; + prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); if (!prealloc) return -ENOMEM; @@ -3362,8 +3333,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, if (!committing) mutex_lock(&fs_info->qgroup_ioctl_lock); - if (!btrfs_qgroup_enabled(fs_info)) - goto out; quota_root = fs_info->quota_root; if (!quota_root) { @@ -3564,7 +3533,7 @@ out: if (!committing) mutex_unlock(&fs_info->qgroup_ioctl_lock); if (need_rescan) - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, "qgroup inherit needs a rescan"); if (qlist_prealloc) { for (int i = 0; i < inherit->num_qgroups; i++) kfree(qlist_prealloc[i]); @@ -3598,7 +3567,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, int ret = 0; LIST_HEAD(qgroup_list); - if (!is_fstree(ref_root)) + if (!btrfs_is_fstree(ref_root)) return 0; if (num_bytes == 0) @@ -3658,7 +3627,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup; LIST_HEAD(qgroup_list); - if (!is_fstree(ref_root)) + if (!btrfs_is_fstree(ref_root)) return; if (num_bytes == 0) @@ -4046,12 +4015,21 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) qgroup_rescan_zero_tracking(fs_info); mutex_lock(&fs_info->qgroup_rescan_lock); - fs_info->qgroup_rescan_running = true; - btrfs_queue_work(fs_info->qgroup_rescan_workers, - &fs_info->qgroup_rescan_work); + /* + * The rescan worker is only for full accounting qgroups, check if it's + * enabled as it is pointless to queue it otherwise. A concurrent quota + * disable may also have just cleared BTRFS_FS_QUOTA_ENABLED. + */ + if (btrfs_qgroup_full_accounting(fs_info)) { + fs_info->qgroup_rescan_running = true; + btrfs_queue_work(fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); + } else { + ret = -ENOTCONN; + } mutex_unlock(&fs_info->qgroup_rescan_lock); - return 0; + return ret; } int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, @@ -4138,8 +4116,8 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode, * Now the entry is in [start, start + len), revert the * EXTENT_QGROUP_RESERVED bit. */ - clear_ret = clear_extent_bits(&inode->io_tree, entry_start, - entry_end, EXTENT_QGROUP_RESERVED); + clear_ret = btrfs_clear_extent_bit(&inode->io_tree, entry_start, entry_end, + EXTENT_QGROUP_RESERVED, NULL); if (!ret && clear_ret < 0) ret = clear_ret; @@ -4226,7 +4204,7 @@ static int qgroup_reserve_data(struct btrfs_inode *inode, int ret; if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED || - !is_fstree(btrfs_root_id(root)) || len == 0) + !btrfs_is_fstree(btrfs_root_id(root)) || len == 0) return 0; /* @reserved parameter is mandatory for qgroup */ @@ -4241,8 +4219,9 @@ static int qgroup_reserve_data(struct btrfs_inode *inode, reserved = *reserved_ret; /* Record already reserved space */ orig_reserved = reserved->bytes_changed; - ret = set_record_extent_bits(&inode->io_tree, start, - start + len -1, EXTENT_QGROUP_RESERVED, reserved); + ret = btrfs_set_record_extent_bits(&inode->io_tree, start, + start + len - 1, EXTENT_QGROUP_RESERVED, + reserved); /* Newly reserved space */ to_reserve = reserved->bytes_changed - orig_reserved; @@ -4335,9 +4314,10 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode, * EXTENT_QGROUP_RESERVED, we won't double free. * So not need to rush. */ - ret = clear_record_extent_bits(&inode->io_tree, free_start, - free_start + free_len - 1, - EXTENT_QGROUP_RESERVED, &changeset); + ret = btrfs_clear_record_extent_bits(&inode->io_tree, free_start, + free_start + free_len - 1, + EXTENT_QGROUP_RESERVED, + &changeset); if (ret < 0) goto out; freed += changeset.bytes_changed; @@ -4361,9 +4341,9 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, int ret; if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) { - return clear_record_extent_bits(&inode->io_tree, start, - start + len - 1, - EXTENT_QGROUP_RESERVED, NULL); + return btrfs_clear_record_extent_bits(&inode->io_tree, start, + start + len - 1, + EXTENT_QGROUP_RESERVED, NULL); } /* In release case, we shouldn't have @reserved */ @@ -4371,8 +4351,8 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, if (free && reserved) return qgroup_free_reserved_data(inode, reserved, start, len, released); extent_changeset_init(&changeset); - ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1, - EXTENT_QGROUP_RESERVED, &changeset); + ret = btrfs_clear_record_extent_bits(&inode->io_tree, start, start + len - 1, + EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) goto out; @@ -4477,11 +4457,11 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, int ret; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || - !is_fstree(btrfs_root_id(root)) || num_bytes == 0) + !btrfs_is_fstree(btrfs_root_id(root)) || num_bytes == 0) return 0; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); - trace_qgroup_meta_reserve(root, (s64)num_bytes, type); + trace_btrfs_qgroup_meta_reserve(root, (s64)num_bytes, type); ret = qgroup_reserve(root, num_bytes, enforce, type); if (ret < 0) return ret; @@ -4522,11 +4502,11 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) struct btrfs_fs_info *fs_info = root->fs_info; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || - !is_fstree(btrfs_root_id(root))) + !btrfs_is_fstree(btrfs_root_id(root))) return; /* TODO: Update trace point to handle such free */ - trace_qgroup_meta_free_all_pertrans(root); + trace_btrfs_qgroup_meta_free_all_pertrans(root); /* Special value -1 means to free all reserved space */ btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), (u64)-1, BTRFS_QGROUP_RSV_META_PERTRANS); @@ -4538,7 +4518,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, struct btrfs_fs_info *fs_info = root->fs_info; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || - !is_fstree(btrfs_root_id(root))) + !btrfs_is_fstree(btrfs_root_id(root))) return; /* @@ -4548,7 +4528,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, */ num_bytes = sub_root_meta_rsv(root, num_bytes, type); BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); - trace_qgroup_meta_reserve(root, -(s64)num_bytes, type); + trace_btrfs_qgroup_meta_reserve(root, -(s64)num_bytes, type); btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), num_bytes, type); } @@ -4597,12 +4577,12 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) struct btrfs_fs_info *fs_info = root->fs_info; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || - !is_fstree(btrfs_root_id(root))) + !btrfs_is_fstree(btrfs_root_id(root))) return; /* Same as btrfs_qgroup_free_meta_prealloc() */ num_bytes = sub_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PREALLOC); - trace_qgroup_meta_convert(root, num_bytes); + trace_btrfs_qgroup_meta_convert(root, num_bytes); qgroup_convert_meta(fs_info, btrfs_root_id(root), num_bytes); if (!sb_rdonly(fs_info->sb)) add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS); @@ -4620,8 +4600,8 @@ void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode) int ret; extent_changeset_init(&changeset); - ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1, - EXTENT_QGROUP_RESERVED, &changeset); + ret = btrfs_clear_record_extent_bits(&inode->io_tree, 0, (u64)-1, + EXTENT_QGROUP_RESERVED, &changeset); WARN_ON(ret < 0); if (WARN_ON(changeset.bytes_changed)) { @@ -4681,6 +4661,28 @@ out: spin_unlock(&swapped_blocks->lock); } +static int qgroup_swapped_block_bytenr_key_cmp(const void *key, const struct rb_node *node) +{ + const u64 *bytenr = key; + const struct btrfs_qgroup_swapped_block *block = rb_entry(node, + struct btrfs_qgroup_swapped_block, node); + + if (block->subvol_bytenr < *bytenr) + return -1; + else if (block->subvol_bytenr > *bytenr) + return 1; + + return 0; +} + +static int qgroup_swapped_block_bytenr_cmp(struct rb_node *new, const struct rb_node *existing) +{ + const struct btrfs_qgroup_swapped_block *new_block = rb_entry(new, + struct btrfs_qgroup_swapped_block, node); + + return qgroup_swapped_block_bytenr_key_cmp(&new_block->subvol_bytenr, existing); +} + /* * Add subtree roots record into @subvol_root. * @@ -4700,8 +4702,7 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, struct btrfs_fs_info *fs_info = subvol_root->fs_info; struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks; struct btrfs_qgroup_swapped_block *block; - struct rb_node **cur; - struct rb_node *parent = NULL; + struct rb_node *node; int level = btrfs_header_level(subvol_parent) - 1; int ret = 0; @@ -4750,46 +4751,32 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, /* Insert @block into @blocks */ spin_lock(&blocks->lock); - cur = &blocks->blocks[level].rb_node; - while (*cur) { + node = rb_find_add(&block->node, &blocks->blocks[level], qgroup_swapped_block_bytenr_cmp); + if (node) { struct btrfs_qgroup_swapped_block *entry; - parent = *cur; - entry = rb_entry(parent, struct btrfs_qgroup_swapped_block, - node); + entry = rb_entry(node, struct btrfs_qgroup_swapped_block, node); - if (entry->subvol_bytenr < block->subvol_bytenr) { - cur = &(*cur)->rb_left; - } else if (entry->subvol_bytenr > block->subvol_bytenr) { - cur = &(*cur)->rb_right; - } else { - if (entry->subvol_generation != - block->subvol_generation || - entry->reloc_bytenr != block->reloc_bytenr || - entry->reloc_generation != - block->reloc_generation) { - /* - * Duplicated but mismatch entry found. - * Shouldn't happen. - * - * Marking qgroup inconsistent should be enough - * for end users. - */ - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); - ret = -EEXIST; - } - kfree(block); - goto out_unlock; + if (entry->subvol_generation != block->subvol_generation || + entry->reloc_bytenr != block->reloc_bytenr || + entry->reloc_generation != block->reloc_generation) { + /* + * Duplicated but mismatch entry found. Shouldn't happen. + * Marking qgroup inconsistent should be enough for end + * users. + */ + DEBUG_WARN("duplicated but mismatched entry found"); + ret = -EEXIST; } + kfree(block); + goto out_unlock; } - rb_link_node(&block->node, parent, cur); - rb_insert_color(&block->node, &blocks->blocks[level]); blocks->swapped = true; out_unlock: spin_unlock(&blocks->lock); out: if (ret < 0) - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, "%s error: %d", __func__, ret); return ret; } @@ -4809,7 +4796,6 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_qgroup_swapped_block *block; struct extent_buffer *reloc_eb = NULL; struct rb_node *node; - bool found = false; bool swapped = false; int level = btrfs_header_level(subvol_eb); int ret = 0; @@ -4817,7 +4803,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, if (!btrfs_qgroup_full_accounting(fs_info)) return 0; - if (!is_fstree(btrfs_root_id(root)) || !root->reloc_root) + if (!btrfs_is_fstree(btrfs_root_id(root)) || !root->reloc_root) return 0; spin_lock(&blocks->lock); @@ -4825,23 +4811,14 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, spin_unlock(&blocks->lock); return 0; } - node = blocks->blocks[level].rb_node; - - while (node) { - block = rb_entry(node, struct btrfs_qgroup_swapped_block, node); - if (block->subvol_bytenr < subvol_eb->start) { - node = node->rb_left; - } else if (block->subvol_bytenr > subvol_eb->start) { - node = node->rb_right; - } else { - found = true; - break; - } - } - if (!found) { + node = rb_find(&subvol_eb->start, &blocks->blocks[level], + qgroup_swapped_block_bytenr_key_cmp); + if (!node) { spin_unlock(&blocks->lock); goto out; } + block = rb_entry(node, struct btrfs_qgroup_swapped_block, node); + /* Found one, remove it from @blocks first and update blocks->swapped */ rb_erase(&block->node, &blocks->blocks[level]); for (i = 0; i < BTRFS_MAX_LEVEL; i++) { @@ -4877,10 +4854,9 @@ free_out: free_extent_buffer(reloc_eb); out: if (ret < 0) { - btrfs_err_rl(fs_info, - "failed to account subtree at bytenr %llu: %d", - subvol_eb->start, ret); - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, + "failed to account subtree at bytenr %llu: %d", + subvol_eb->start, ret); } return ret; } @@ -4911,7 +4887,7 @@ int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) return 0; - if (!is_fstree(root)) + if (!btrfs_is_fstree(root)) return 0; /* If the extent predates enabling quotas, don't count it. */ diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index e233cc79af18..a979fd59a4da 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -22,6 +22,9 @@ struct btrfs_ioctl_quota_ctl_args; struct btrfs_trans_handle; struct btrfs_delayed_ref_root; struct btrfs_inode; +struct btrfs_transaction; +struct btrfs_block_group; +struct btrfs_qgroup_swapped_blocks; /* * Btrfs qgroup overview diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 9ffc79f250fb..cab0b291088c 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -13,12 +13,13 @@ #include "volumes.h" #include "print-tree.h" -static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, +static int btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, const struct btrfs_key *oldkey, u64 newlen, u64 frontpad) { - struct btrfs_stripe_extent *extent; + struct btrfs_root *stripe_root = trans->fs_info->stripe_root; + struct btrfs_stripe_extent *extent, *newitem; struct extent_buffer *leaf; int slot; size_t item_size; @@ -27,23 +28,39 @@ static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, .type = BTRFS_RAID_STRIPE_KEY, .offset = newlen, }; + int ret; + ASSERT(newlen > 0); ASSERT(oldkey->type == BTRFS_RAID_STRIPE_KEY); leaf = path->nodes[0]; slot = path->slots[0]; item_size = btrfs_item_size(leaf, slot); + + newitem = kzalloc(item_size, GFP_NOFS); + if (!newitem) + return -ENOMEM; + extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) { struct btrfs_raid_stride *stride = &extent->strides[i]; u64 phys; - phys = btrfs_raid_stride_physical(leaf, stride); - btrfs_set_raid_stride_physical(leaf, stride, phys + frontpad); + phys = btrfs_raid_stride_physical(leaf, stride) + frontpad; + btrfs_set_stack_raid_stride_physical(&newitem->strides[i], phys); } - btrfs_set_item_key_safe(trans, path, &newkey); + ret = btrfs_del_item(trans, stripe_root, path); + if (ret) + goto out; + + btrfs_release_path(path); + ret = btrfs_insert_item(trans, stripe_root, &newkey, newitem, item_size); + +out: + kfree(newitem); + return ret; } int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length) @@ -59,9 +76,22 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le int slot; int ret; - if (!stripe_root) + if (!btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE) || !stripe_root) return 0; + if (!btrfs_is_testing(fs_info)) { + struct btrfs_chunk_map *map; + bool use_rst; + + map = btrfs_find_chunk_map(fs_info, start, length); + if (!map) + return -EINVAL; + use_rst = btrfs_need_stripe_tree_update(fs_info, map->type); + btrfs_free_chunk_map(map); + if (!use_rst) + return 0; + } + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -85,6 +115,37 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le found_end = found_start + key.offset; ret = 0; + /* + * The stripe extent starts before the range we want to delete, + * but the range spans more than one stripe extent: + * + * |--- RAID Stripe Extent ---||--- RAID Stripe Extent ---| + * |--- keep ---|--- drop ---| + * + * This means we have to get the previous item, truncate its + * length and then restart the search. + */ + if (found_start > start) { + if (slot == 0) { + ret = btrfs_previous_item(stripe_root, path, start, + BTRFS_RAID_STRIPE_KEY); + if (ret) { + if (ret > 0) + ret = -ENOENT; + break; + } + } else { + path->slots[0]--; + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + found_start = key.objectid; + found_end = found_start + key.offset; + ASSERT(found_start <= start); + } + if (key.type != BTRFS_RAID_STRIPE_KEY) break; @@ -96,6 +157,54 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le found_start, found_end); /* + * The stripe extent starts before the range we want to delete + * and ends after the range we want to delete, i.e. we're + * punching a hole in the stripe extent: + * + * |--- RAID Stripe Extent ---| + * | keep |--- drop ---| keep | + * + * This means we need to a) truncate the existing item and b) + * create a second item for the remaining range. + */ + if (found_start < start && found_end > end) { + size_t item_size; + u64 diff_start = start - found_start; + u64 diff_end = found_end - end; + struct btrfs_stripe_extent *extent; + struct btrfs_key newkey = { + .objectid = end, + .type = BTRFS_RAID_STRIPE_KEY, + .offset = diff_end, + }; + + /* The "right" item. */ + ret = btrfs_duplicate_item(trans, stripe_root, path, &newkey); + if (ret) + break; + + item_size = btrfs_item_size(leaf, path->slots[0]); + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_stripe_extent); + + for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) { + struct btrfs_raid_stride *stride = &extent->strides[i]; + u64 phys; + + phys = btrfs_raid_stride_physical(leaf, stride); + phys += diff_start + length; + btrfs_set_raid_stride_physical(leaf, stride, phys); + } + + /* The "left" item. */ + path->slots[0]--; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_partially_delete_raid_extent(trans, path, &key, + diff_start, 0); + break; + } + + /* * The stripe extent starts before the range we want to delete: * * |--- RAID Stripe Extent ---| @@ -105,11 +214,18 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le * length to the new size and then re-insert the item. */ if (found_start < start) { - u64 diff = start - found_start; + u64 diff_start = start - found_start; btrfs_partially_delete_raid_extent(trans, path, &key, - diff, 0); - break; + diff_start, 0); + + start += (key.offset - diff_start); + length -= (key.offset - diff_start); + if (length == 0) + break; + + btrfs_release_path(path); + continue; } /* @@ -122,13 +238,16 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le * length to the new size and then re-insert the item. */ if (found_end > end) { - u64 diff = found_end - end; + u64 diff_end = found_end - end; btrfs_partially_delete_raid_extent(trans, path, &key, - diff, diff); + key.offset - length, + length); + ASSERT(key.offset - diff_end == length); break; } + /* Finally we can delete the whole item, no more special cases. */ ret = btrfs_del_item(trans, stripe_root, path); if (ret) break; @@ -169,7 +288,6 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot), item_size); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); return ret; @@ -199,12 +317,8 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, for (int i = 0; i < num_stripes; i++) { u64 devid = bioc->stripes[i].dev->devid; u64 physical = bioc->stripes[i].physical; - u64 length = bioc->stripes[i].length; struct btrfs_raid_stride *raid_stride = &stripe_extent->strides[i]; - if (length == 0) - length = bioc->size; - btrfs_set_stack_raid_stride_devid(raid_stride, devid); btrfs_set_stack_raid_stride_physical(raid_stride, physical); } @@ -215,11 +329,14 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent, item_size); - if (ret == -EEXIST) + if (ret == -EEXIST) { ret = update_raid_extent_item(trans, &stripe_key, stripe_extent, item_size); - if (ret) + if (ret) + btrfs_abort_transaction(trans, ret); + } else if (ret) { btrfs_abort_transaction(trans, ret); + } kfree(stripe_extent); diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h index 541836421778..69942ad43140 100644 --- a/fs/btrfs/raid-stripe-tree.h +++ b/fs/btrfs/raid-stripe-tree.h @@ -9,6 +9,7 @@ #include <linux/types.h> #include <uapi/linux/btrfs_tree.h> #include "fs.h" +#include "accessors.h" #define BTRFS_RST_SUPP_BLOCK_GROUP_MASK (BTRFS_BLOCK_GROUP_DUP | \ BTRFS_BLOCK_GROUP_RAID1_MASK | \ diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index cdd373c27784..3ff2bedfb3a4 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -134,14 +134,17 @@ struct btrfs_stripe_hash_table { }; /* - * A bvec like structure to present a sector inside a page. - * - * Unlike bvec we don't need bvlen, as it's fixed to sectorsize. + * A structure to present a sector inside a page, the length is fixed to + * sectorsize; */ struct sector_ptr { - struct page *page; - unsigned int pgoff:24; - unsigned int uptodate:8; + /* + * Blocks from the bio list can still be highmem. + * So here we use physical address to present a page and the offset inside it. + */ + phys_addr_t paddr; + bool has_paddr; + bool uptodate; }; static void rmw_rbio_work(struct work_struct *work); @@ -200,8 +203,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) struct btrfs_stripe_hash_table *x; struct btrfs_stripe_hash *cur; struct btrfs_stripe_hash *h; - int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; - int i; + unsigned int num_entries = 1U << BTRFS_STRIPE_HASH_TABLE_BITS; if (info->stripe_hash_table) return 0; @@ -222,7 +224,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) h = table->table; - for (i = 0; i < num_entries; i++) { + for (unsigned int i = 0; i < num_entries; i++) { cur = h + i; INIT_LIST_HEAD(&cur->hash_list); spin_lock_init(&cur->lock); @@ -233,6 +235,14 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) return 0; } +static void memcpy_sectors(const struct sector_ptr *dst, + const struct sector_ptr *src, u32 blocksize) +{ + memcpy_page(phys_to_page(dst->paddr), offset_in_page(dst->paddr), + phys_to_page(src->paddr), offset_in_page(src->paddr), + blocksize); +} + /* * caching an rbio means to copy anything from the * bio_sectors array into the stripe_pages array. We @@ -253,7 +263,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) for (i = 0; i < rbio->nr_sectors; i++) { /* Some range not covered by bio (partial write), skip it */ - if (!rbio->bio_sectors[i].page) { + if (!rbio->bio_sectors[i].has_paddr) { /* * Even if the sector is not covered by bio, if it is * a data sector it should still be uptodate as it is @@ -264,12 +274,8 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) continue; } - ASSERT(rbio->stripe_sectors[i].page); - memcpy_page(rbio->stripe_sectors[i].page, - rbio->stripe_sectors[i].pgoff, - rbio->bio_sectors[i].page, - rbio->bio_sectors[i].pgoff, - rbio->bioc->fs_info->sectorsize); + memcpy_sectors(&rbio->stripe_sectors[i], &rbio->bio_sectors[i], + rbio->bioc->fs_info->sectorsize); rbio->stripe_sectors[i].uptodate = 1; } set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); @@ -326,8 +332,13 @@ static void index_stripe_sectors(struct btrfs_raid_bio *rbio) int page_index = offset >> PAGE_SHIFT; ASSERT(page_index < rbio->nr_pages); - rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index]; - rbio->stripe_sectors[i].pgoff = offset_in_page(offset); + if (!rbio->stripe_pages[page_index]) + continue; + + rbio->stripe_sectors[i].has_paddr = true; + rbio->stripe_sectors[i].paddr = + page_to_phys(rbio->stripe_pages[page_index]) + + offset_in_page(offset); } } @@ -507,9 +518,8 @@ static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) spin_lock(&table->cache_lock); while (!list_empty(&table->stripe_cache)) { - rbio = list_entry(table->stripe_cache.next, - struct btrfs_raid_bio, - stripe_cache); + rbio = list_first_entry(&table->stripe_cache, + struct btrfs_raid_bio, stripe_cache); __remove_rbio_from_cache(rbio); } spin_unlock(&table->cache_lock); @@ -567,9 +577,9 @@ static void cache_rbio(struct btrfs_raid_bio *rbio) if (table->cache_size > RBIO_CACHE_SIZE) { struct btrfs_raid_bio *found; - found = list_entry(table->stripe_cache.prev, - struct btrfs_raid_bio, - stripe_cache); + found = list_last_entry(&table->stripe_cache, + struct btrfs_raid_bio, + stripe_cache); if (found != rbio) __remove_rbio_from_cache(found); @@ -882,14 +892,14 @@ done_nolock: remove_rbio_from_cache(rbio); } -static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) +static void rbio_endio_bio_list(struct bio *cur, blk_status_t status) { struct bio *next; while (cur) { next = cur->bi_next; cur->bi_next = NULL; - cur->bi_status = err; + cur->bi_status = status; bio_endio(cur); cur = next; } @@ -899,7 +909,7 @@ static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) * this frees the rbio and runs through all the bios in the * bio_list and calls end_io on them */ -static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) +static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status) { struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *extra; @@ -928,9 +938,9 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) extra = bio_list_get(&rbio->bio_list); free_raid_bio(rbio); - rbio_endio_bio_list(cur, err); + rbio_endio_bio_list(cur, status); if (extra) - rbio_endio_bio_list(extra, err); + rbio_endio_bio_list(extra, status); } /* @@ -962,9 +972,9 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, spin_lock(&rbio->bio_list_lock); sector = &rbio->bio_sectors[index]; - if (sector->page || bio_list_only) { + if (sector->has_paddr || bio_list_only) { /* Don't return sector without a valid page pointer */ - if (!sector->page) + if (!sector->has_paddr) sector = NULL; spin_unlock(&rbio->bio_list_lock); return sector; @@ -1142,7 +1152,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, rbio, stripe_nr); ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors, rbio, sector_nr); - ASSERT(sector->page); + ASSERT(sector->has_paddr); stripe = &rbio->bioc->stripes[stripe_nr]; disk_start = stripe->physical + sector_nr * sectorsize; @@ -1173,8 +1183,8 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, */ if (last_end == disk_start && !last->bi_status && last->bi_bdev == stripe->dev->bdev) { - ret = bio_add_page(last, sector->page, sectorsize, - sector->pgoff); + ret = bio_add_page(last, phys_to_page(sector->paddr), + sectorsize, offset_in_page(sector->paddr)); if (ret == sectorsize) return 0; } @@ -1187,7 +1197,8 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT; bio->bi_private = rbio; - __bio_add_page(bio, sector->page, sectorsize, sector->pgoff); + __bio_add_page(bio, phys_to_page(sector->paddr), sectorsize, + offset_in_page(sector->paddr)); bio_list_add(bio_list, bio); return 0; } @@ -1195,23 +1206,20 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - struct bio_vec bvec; - struct bvec_iter iter; + const u32 sectorsize_bits = rbio->bioc->fs_info->sectorsize_bits; + struct bvec_iter iter = bio->bi_iter; u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - rbio->bioc->full_stripe_logical; - bio_for_each_segment(bvec, bio, iter) { - u32 bvec_offset; - - for (bvec_offset = 0; bvec_offset < bvec.bv_len; - bvec_offset += sectorsize, offset += sectorsize) { - int index = offset / sectorsize; - struct sector_ptr *sector = &rbio->bio_sectors[index]; + while (iter.bi_size) { + unsigned int index = (offset >> sectorsize_bits); + struct sector_ptr *sector = &rbio->bio_sectors[index]; + struct bio_vec bv = bio_iter_iovec(bio, iter); - sector->page = bvec.bv_page; - sector->pgoff = bvec.bv_offset + bvec_offset; - ASSERT(sector->pgoff < PAGE_SIZE); - } + sector->has_paddr = true; + sector->paddr = bvec_phys(&bv); + bio_advance_iter_single(bio, &iter, sectorsize); + offset += sectorsize; } } @@ -1289,6 +1297,15 @@ static void assert_rbio(struct btrfs_raid_bio *rbio) ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio); } +static inline void *kmap_local_sector(const struct sector_ptr *sector) +{ + /* The sector pointer must have a page mapped to it. */ + ASSERT(sector->has_paddr); + + return kmap_local_page(phys_to_page(sector->paddr)) + + offset_in_page(sector->paddr); +} + /* Generate PQ for one vertical stripe. */ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) { @@ -1301,14 +1318,13 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) /* First collect one sector from each data stripe */ for (stripe = 0; stripe < rbio->nr_data; stripe++) { sector = sector_in_rbio(rbio, stripe, sectornr, 0); - pointers[stripe] = kmap_local_page(sector->page) + - sector->pgoff; + pointers[stripe] = kmap_local_sector(sector); } /* Then add the parity stripe */ sector = rbio_pstripe_sector(rbio, sectornr); sector->uptodate = 1; - pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; + pointers[stripe++] = kmap_local_sector(sector); if (has_qstripe) { /* @@ -1317,8 +1333,7 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) */ sector = rbio_qstripe_sector(rbio, sectornr); sector->uptodate = 1; - pointers[stripe++] = kmap_local_page(sector->page) + - sector->pgoff; + pointers[stripe++] = kmap_local_sector(sector); assert_rbio(rbio); raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, @@ -1477,15 +1492,14 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) * stripe_pages[], thus we need to locate the sector. */ static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, - struct page *page, - unsigned int pgoff) + phys_addr_t paddr) { int i; for (i = 0; i < rbio->nr_sectors; i++) { struct sector_ptr *sector = &rbio->stripe_sectors[i]; - if (sector->page == page && sector->pgoff == pgoff) + if (sector->has_paddr && sector->paddr == paddr) return sector; } return NULL; @@ -1505,11 +1519,10 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) bio_for_each_segment_all(bvec, bio, iter_all) { struct sector_ptr *sector; - int pgoff; + phys_addr_t paddr = bvec_phys(bvec); - for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len; - pgoff += sectorsize) { - sector = find_stripe_sector(rbio, bvec->bv_page, pgoff); + for (u32 off = 0; off < bvec->bv_len; off += sectorsize) { + sector = find_stripe_sector(rbio, paddr + off); ASSERT(sector); if (sector) sector->uptodate = 1; @@ -1519,17 +1532,14 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio) { - struct bio_vec *bv = bio_first_bvec_all(bio); + phys_addr_t bvec_paddr = bvec_phys(bio_first_bvec_all(bio)); int i; for (i = 0; i < rbio->nr_sectors; i++) { - struct sector_ptr *sector; - - sector = &rbio->stripe_sectors[i]; - if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) + if (rbio->stripe_sectors[i].paddr == bvec_paddr) break; - sector = &rbio->bio_sectors[i]; - if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) + if (rbio->bio_sectors[i].has_paddr && + rbio->bio_sectors[i].paddr == bvec_paddr) break; } ASSERT(i < rbio->nr_sectors); @@ -1575,11 +1585,11 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, return; bio_for_each_segment_all(bvec, bio, iter_all) { - int bv_offset; + void *kaddr; - for (bv_offset = bvec->bv_offset; - bv_offset < bvec->bv_offset + bvec->bv_len; - bv_offset += fs_info->sectorsize, total_sector_nr++) { + kaddr = bvec_kmap_local(bvec); + for (u32 off = 0; off < bvec->bv_len; + off += fs_info->sectorsize, total_sector_nr++) { u8 csum_buf[BTRFS_CSUM_SIZE]; u8 *expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size; @@ -1589,11 +1599,12 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, if (!test_bit(total_sector_nr, rbio->csum_bitmap)) continue; - ret = btrfs_check_sector_csum(fs_info, bvec->bv_page, - bv_offset, csum_buf, expected_csum); + ret = btrfs_check_sector_csum(fs_info, kaddr + off, + csum_buf, expected_csum); if (ret < 0) set_bit(total_sector_nr, rbio->error_bitmap); } + kunmap_local(kaddr); } } @@ -1689,8 +1700,8 @@ static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule) list_sort(NULL, &plug->rbio_list, plug_cmp); while (!list_empty(&plug->rbio_list)) { - cur = list_entry(plug->rbio_list.next, - struct btrfs_raid_bio, plug_list); + cur = list_first_entry(&plug->rbio_list, + struct btrfs_raid_bio, plug_list); list_del_init(&cur->plug_list); if (rbio_is_full(cur)) { @@ -1791,6 +1802,7 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, struct sector_ptr *sector; u8 csum_buf[BTRFS_CSUM_SIZE]; u8 *csum_expected; + void *kaddr; int ret; if (!rbio->csum_bitmap || !rbio->csum_buf) @@ -1809,13 +1821,12 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); } - ASSERT(sector->page); - csum_expected = rbio->csum_buf + (stripe_nr * rbio->stripe_nsectors + sector_nr) * fs_info->csum_size; - ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff, - csum_buf, csum_expected); + kaddr = kmap_local_sector(sector); + ret = btrfs_check_sector_csum(fs_info, kaddr, csum_buf, csum_expected); + kunmap_local(kaddr); return ret; } @@ -1872,9 +1883,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, } else { sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); } - ASSERT(sector->page); - pointers[stripe_nr] = kmap_local_page(sector->page) + - sector->pgoff; + pointers[stripe_nr] = kmap_local_sector(sector); unmap_array[stripe_nr] = pointers[stripe_nr]; } @@ -2282,9 +2291,8 @@ static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) static void raid_wait_write_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - blk_status_t err = bio->bi_status; - if (err) + if (bio->bi_status) rbio_update_error_bitmap(rbio, bio); bio_put(bio); if (atomic_dec_and_test(&rbio->stripes_pending)) @@ -2326,7 +2334,7 @@ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) * thus this rbio can not be cached one, as cached one must * have all its data sectors present and uptodate. */ - if (!sector->page || !sector->uptodate) + if (!sector->has_paddr || !sector->uptodate) return true; } return false; @@ -2516,6 +2524,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) int stripe; int sectornr; bool has_qstripe; + struct page *page; struct sector_ptr p_sector = { 0 }; struct sector_ptr q_sector = { 0 }; struct bio_list bio_list; @@ -2547,29 +2556,33 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) */ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - p_sector.page = alloc_page(GFP_NOFS); - if (!p_sector.page) + page = alloc_page(GFP_NOFS); + if (!page) return -ENOMEM; - p_sector.pgoff = 0; + p_sector.has_paddr = true; + p_sector.paddr = page_to_phys(page); p_sector.uptodate = 1; + page = NULL; if (has_qstripe) { /* RAID6, allocate and map temp space for the Q stripe */ - q_sector.page = alloc_page(GFP_NOFS); - if (!q_sector.page) { - __free_page(p_sector.page); - p_sector.page = NULL; + page = alloc_page(GFP_NOFS); + if (!page) { + __free_page(phys_to_page(p_sector.paddr)); + p_sector.has_paddr = false; return -ENOMEM; } - q_sector.pgoff = 0; + q_sector.has_paddr = true; + q_sector.paddr = page_to_phys(page); q_sector.uptodate = 1; - pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page); + page = NULL; + pointers[rbio->real_stripes - 1] = kmap_local_sector(&q_sector); } bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); /* Map the parity stripe just once */ - pointers[nr_data] = kmap_local_page(p_sector.page); + pointers[nr_data] = kmap_local_sector(&p_sector); for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { struct sector_ptr *sector; @@ -2578,8 +2591,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) /* first collect one page from each data stripe */ for (stripe = 0; stripe < nr_data; stripe++) { sector = sector_in_rbio(rbio, stripe, sectornr, 0); - pointers[stripe] = kmap_local_page(sector->page) + - sector->pgoff; + pointers[stripe] = kmap_local_sector(sector); } if (has_qstripe) { @@ -2595,7 +2607,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) /* Check scrubbing parity and repair it */ sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); - parity = kmap_local_page(sector->page) + sector->pgoff; + parity = kmap_local_sector(sector); if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0) memcpy(parity, pointers[rbio->scrubp], sectorsize); else @@ -2608,12 +2620,11 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) } kunmap_local(pointers[nr_data]); - __free_page(p_sector.page); - p_sector.page = NULL; - if (q_sector.page) { - kunmap_local(pointers[rbio->real_stripes - 1]); - __free_page(q_sector.page); - q_sector.page = NULL; + __free_page(phys_to_page(p_sector.paddr)); + p_sector.has_paddr = false; + if (q_sector.has_paddr) { + __free_page(phys_to_page(q_sector.paddr)); + q_sector.has_paddr = false; } /* diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h deleted file mode 100644 index 1c2d7cb1fe6f..000000000000 --- a/fs/btrfs/rcu-string.h +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2012 Red Hat. All rights reserved. - */ - -#ifndef BTRFS_RCU_STRING_H -#define BTRFS_RCU_STRING_H - -#include <linux/types.h> -#include <linux/string.h> -#include <linux/slab.h> -#include <linux/rcupdate.h> -#include <linux/printk.h> - -struct rcu_string { - struct rcu_head rcu; - char str[]; -}; - -static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask) -{ - size_t len = strlen(src) + 1; - struct rcu_string *ret = kzalloc(sizeof(struct rcu_string) + - (len * sizeof(char)), mask); - if (!ret) - return ret; - /* Warn if the source got unexpectedly truncated. */ - if (WARN_ON(strscpy(ret->str, src, len) < 0)) { - kfree(ret); - return NULL; - } - return ret; -} - -static inline void rcu_string_free(struct rcu_string *str) -{ - if (str) - kfree_rcu(str, rcu); -} - -#define printk_in_rcu(fmt, ...) do { \ - rcu_read_lock(); \ - printk(fmt, __VA_ARGS__); \ - rcu_read_unlock(); \ -} while (0) - -#define printk_ratelimited_in_rcu(fmt, ...) do { \ - rcu_read_lock(); \ - printk_ratelimited(fmt, __VA_ARGS__); \ - rcu_read_unlock(); \ -} while (0) - -#define rcu_str_deref(rcu_str) ({ \ - struct rcu_string *__str = rcu_dereference(rcu_str); \ - __str->str; \ -}) - -#endif diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 9522a8b79d22..3871c3a6c743 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -75,69 +75,70 @@ struct block_entry { struct list_head actions; }; +static int block_entry_bytenr_key_cmp(const void *key, const struct rb_node *node) +{ + const u64 *bytenr = key; + const struct block_entry *entry = rb_entry(node, struct block_entry, node); + + if (entry->bytenr < *bytenr) + return 1; + else if (entry->bytenr > *bytenr) + return -1; + + return 0; +} + +static int block_entry_bytenr_cmp(struct rb_node *new, const struct rb_node *existing) +{ + const struct block_entry *new_entry = rb_entry(new, struct block_entry, node); + + return block_entry_bytenr_key_cmp(&new_entry->bytenr, existing); +} + static struct block_entry *insert_block_entry(struct rb_root *root, struct block_entry *be) { - struct rb_node **p = &root->rb_node; - struct rb_node *parent_node = NULL; - struct block_entry *entry; - - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct block_entry, node); - if (entry->bytenr > be->bytenr) - p = &(*p)->rb_left; - else if (entry->bytenr < be->bytenr) - p = &(*p)->rb_right; - else - return entry; - } + struct rb_node *node; - rb_link_node(&be->node, parent_node, p); - rb_insert_color(&be->node, root); - return NULL; + node = rb_find_add(&be->node, root, block_entry_bytenr_cmp); + return rb_entry_safe(node, struct block_entry, node); } static struct block_entry *lookup_block_entry(struct rb_root *root, u64 bytenr) { - struct rb_node *n; - struct block_entry *entry = NULL; + struct rb_node *node; - n = root->rb_node; - while (n) { - entry = rb_entry(n, struct block_entry, node); - if (entry->bytenr < bytenr) - n = n->rb_right; - else if (entry->bytenr > bytenr) - n = n->rb_left; - else - return entry; - } - return NULL; + node = rb_find(&bytenr, root, block_entry_bytenr_key_cmp); + return rb_entry_safe(node, struct block_entry, node); +} + +static int root_entry_root_objectid_key_cmp(const void *key, const struct rb_node *node) +{ + const u64 *objectid = key; + const struct root_entry *entry = rb_entry(node, struct root_entry, node); + + if (entry->root_objectid < *objectid) + return 1; + else if (entry->root_objectid > *objectid) + return -1; + + return 0; +} + +static int root_entry_root_objectid_cmp(struct rb_node *new, const struct rb_node *existing) +{ + const struct root_entry *new_entry = rb_entry(new, struct root_entry, node); + + return root_entry_root_objectid_key_cmp(&new_entry->root_objectid, existing); } static struct root_entry *insert_root_entry(struct rb_root *root, struct root_entry *re) { - struct rb_node **p = &root->rb_node; - struct rb_node *parent_node = NULL; - struct root_entry *entry; - - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct root_entry, node); - if (entry->root_objectid > re->root_objectid) - p = &(*p)->rb_left; - else if (entry->root_objectid < re->root_objectid) - p = &(*p)->rb_right; - else - return entry; - } - - rb_link_node(&re->node, parent_node, p); - rb_insert_color(&re->node, root); - return NULL; + struct rb_node *node; + node = rb_find_add(&re->node, root, root_entry_root_objectid_cmp); + return rb_entry_safe(node, struct root_entry, node); } static int comp_refs(struct ref_entry *ref1, struct ref_entry *ref2) @@ -161,48 +162,29 @@ static int comp_refs(struct ref_entry *ref1, struct ref_entry *ref2) return 0; } +static int ref_entry_cmp(struct rb_node *new, const struct rb_node *existing) +{ + struct ref_entry *new_entry = rb_entry(new, struct ref_entry, node); + struct ref_entry *existing_entry = rb_entry(existing, struct ref_entry, node); + + return comp_refs(new_entry, existing_entry); +} + static struct ref_entry *insert_ref_entry(struct rb_root *root, struct ref_entry *ref) { - struct rb_node **p = &root->rb_node; - struct rb_node *parent_node = NULL; - struct ref_entry *entry; - int cmp; - - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct ref_entry, node); - cmp = comp_refs(entry, ref); - if (cmp > 0) - p = &(*p)->rb_left; - else if (cmp < 0) - p = &(*p)->rb_right; - else - return entry; - } - - rb_link_node(&ref->node, parent_node, p); - rb_insert_color(&ref->node, root); - return NULL; + struct rb_node *node; + node = rb_find_add(&ref->node, root, ref_entry_cmp); + return rb_entry_safe(node, struct ref_entry, node); } static struct root_entry *lookup_root_entry(struct rb_root *root, u64 objectid) { - struct rb_node *n; - struct root_entry *entry = NULL; + struct rb_node *node; - n = root->rb_node; - while (n) { - entry = rb_entry(n, struct root_entry, node); - if (entry->root_objectid < objectid) - n = n->rb_right; - else if (entry->root_objectid > objectid) - n = n->rb_left; - else - return entry; - } - return NULL; + node = rb_find(&objectid, root, root_entry_root_objectid_key_cmp); + return rb_entry_safe(node, struct root_entry, node); } #ifdef CONFIG_STACKTRACE @@ -668,7 +650,7 @@ static void dump_block_entry(struct btrfs_fs_info *fs_info, * our sanity checks pass as they are no longer needed. */ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, - struct btrfs_ref *generic_ref) + const struct btrfs_ref *generic_ref) { struct ref_entry *ref = NULL, *exist; struct ref_action *ra = NULL; @@ -857,6 +839,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, "dropping a ref for a root that doesn't have a ref on the block"); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); + rb_erase(&ref->node, &be->refs); kfree(ref); kfree(ra); goto out_unlock; diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h index 3511e1a5c96b..559bd25a2b7a 100644 --- a/fs/btrfs/ref-verify.h +++ b/fs/btrfs/ref-verify.h @@ -19,7 +19,7 @@ struct btrfs_ref; int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info); void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info); int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, - struct btrfs_ref *generic_ref); + const struct btrfs_ref *generic_ref); void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, u64 len); @@ -39,7 +39,7 @@ static inline void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info) } static inline int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, - struct btrfs_ref *generic_ref) + const struct btrfs_ref *generic_ref) { return 0; } diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index f0824c948cb7..ce25ab7f0e99 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -46,11 +46,9 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); - goto out; + return ret; } - ret = btrfs_end_transaction(trans); -out: - return ret; + return btrfs_end_transaction(trans); } static int copy_inline_to_page(struct btrfs_inode *inode, @@ -87,7 +85,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, btrfs_alloc_write_mask(mapping)); if (IS_ERR(folio)) { - ret = -ENOMEM; + ret = PTR_ERR(folio); goto out_unlock; } @@ -95,9 +93,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode, if (ret < 0) goto out_unlock; - clear_extent_bit(&inode->io_tree, file_offset, range_end, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - NULL); + btrfs_clear_extent_bit(&inode->io_tree, file_offset, range_end, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, NULL); ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL); if (ret) goto out_unlock; @@ -165,7 +162,7 @@ out: * the source inode to destination inode when possible. When not possible we * copy the inline extent's data into the respective page of the inode. */ -static int clone_copy_inline_extent(struct inode *dst, +static int clone_copy_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_key *new_key, const u64 drop_start, @@ -175,8 +172,8 @@ static int clone_copy_inline_extent(struct inode *dst, char *inline_data, struct btrfs_trans_handle **trans_out) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(dst); - struct btrfs_root *root = BTRFS_I(dst)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; const u64 aligned_end = ALIGN(new_key->offset + datal, fs_info->sectorsize); struct btrfs_trans_handle *trans = NULL; @@ -185,12 +182,12 @@ static int clone_copy_inline_extent(struct inode *dst, struct btrfs_key key; if (new_key->offset > 0) { - ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + ret = copy_inline_to_page(inode, new_key->offset, inline_data, size, datal, comp_type); goto out; } - key.objectid = btrfs_ino(BTRFS_I(dst)); + key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -205,7 +202,7 @@ static int clone_copy_inline_extent(struct inode *dst, goto copy_inline_extent; } btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == btrfs_ino(BTRFS_I(dst)) && + if (key.objectid == btrfs_ino(inode) && key.type == BTRFS_EXTENT_DATA_KEY) { /* * There's an implicit hole at file offset 0, copy the @@ -214,7 +211,7 @@ static int clone_copy_inline_extent(struct inode *dst, ASSERT(key.offset > 0); goto copy_to_page; } - } else if (i_size_read(dst) <= datal) { + } else if (i_size_read(&inode->vfs_inode) <= datal) { struct btrfs_file_extent_item *ei; ei = btrfs_item_ptr(path->nodes[0], path->slots[0], @@ -236,7 +233,7 @@ copy_inline_extent: * We have no extent items, or we have an extent at offset 0 which may * or may not be inlined. All these cases are dealt the same way. */ - if (i_size_read(dst) > datal) { + if (i_size_read(&inode->vfs_inode) > datal) { /* * At the destination offset 0 we have either a hole, a regular * extent or an inline extent larger then the one we want to @@ -270,20 +267,26 @@ copy_inline_extent: drop_args.start = drop_start; drop_args.end = aligned_end; drop_args.drop_cache = true; - ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args); - if (ret) + ret = btrfs_drop_extents(trans, root, inode, &drop_args); + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } ret = btrfs_insert_empty_item(trans, root, path, new_key, size); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } write_extent_buffer(path->nodes[0], inline_data, btrfs_item_ptr_offset(path->nodes[0], path->slots[0]), size); - btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found); - btrfs_set_inode_full_sync(BTRFS_I(dst)); - ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end); + btrfs_update_inode_bytes(inode, datal, drop_args.bytes_found); + btrfs_set_inode_full_sync(inode); + ret = btrfs_inode_set_file_extent_range(inode, 0, aligned_end); + if (ret) + btrfs_abort_transaction(trans, ret); out: if (!ret && !trans) { /* @@ -298,10 +301,8 @@ out: trans = NULL; } } - if (ret && trans) { - btrfs_abort_transaction(trans, ret); + if (ret && trans) btrfs_end_transaction(trans); - } if (!ret) *trans_out = trans; @@ -318,7 +319,7 @@ copy_to_page: */ btrfs_release_path(path); - ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + ret = copy_inline_to_page(inode, new_key->offset, inline_data, size, datal, comp_type); goto out; } @@ -526,7 +527,7 @@ process_slot: goto out; } - ret = clone_copy_inline_extent(inode, path, &new_key, + ret = clone_copy_inline_extent(BTRFS_I(inode), path, &new_key, drop_start, datal, size, comp, buf, &trans); if (ret) @@ -617,26 +618,26 @@ out: return ret; } -static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2) +static void btrfs_double_mmap_lock(struct btrfs_inode *inode1, struct btrfs_inode *inode2) { if (inode1 < inode2) swap(inode1, inode2); - down_write(&BTRFS_I(inode1)->i_mmap_lock); - down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING); + down_write(&inode1->i_mmap_lock); + down_write_nested(&inode2->i_mmap_lock, SINGLE_DEPTH_NESTING); } -static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2) +static void btrfs_double_mmap_unlock(struct btrfs_inode *inode1, struct btrfs_inode *inode2) { - up_write(&BTRFS_I(inode1)->i_mmap_lock); - up_write(&BTRFS_I(inode2)->i_mmap_lock); + up_write(&inode1->i_mmap_lock); + up_write(&inode2->i_mmap_lock); } -static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, - struct inode *dst, u64 dst_loff) +static int btrfs_extent_same_range(struct btrfs_inode *src, u64 loff, u64 len, + struct btrfs_inode *dst, u64 dst_loff) { const u64 end = dst_loff + len - 1; struct extent_state *cached_state = NULL; - struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info; + struct btrfs_fs_info *fs_info = src->root->fs_info; const u64 bs = fs_info->sectorsize; int ret; @@ -646,9 +647,10 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, * because we have already locked the inode's i_mmap_lock in exclusive * mode. */ - lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state); - ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); - unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state); + btrfs_lock_extent(&dst->io_tree, dst_loff, end, &cached_state); + ret = btrfs_clone(&src->vfs_inode, &dst->vfs_inode, loff, len, + ALIGN(len, bs), dst_loff, 1); + btrfs_unlock_extent(&dst->io_tree, dst_loff, end, &cached_state); btrfs_btree_balance_dirty(fs_info); @@ -678,8 +680,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); for (i = 0; i < chunk_count; i++) { - ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, - dst, dst_loff); + ret = btrfs_extent_same_range(BTRFS_I(src), loff, BTRFS_MAX_DEDUPE_LEN, + BTRFS_I(dst), dst_loff); if (ret) goto out; @@ -688,7 +690,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, } if (tail_len > 0) - ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff); + ret = btrfs_extent_same_range(BTRFS_I(src), loff, tail_len, + BTRFS_I(dst), dst_loff); out: spin_lock(&root_dst->root_item_lock); root_dst->dedupe_in_progress--; @@ -747,9 +750,9 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, * mode. */ end = destoff + len - 1; - lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); + btrfs_lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); - unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); /* * We may have copied an inline extent into a page of the destination @@ -775,24 +778,24 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags) { - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); - u64 bs = BTRFS_I(inode_out)->root->fs_info->sectorsize; + struct btrfs_inode *inode_in = BTRFS_I(file_inode(file_in)); + struct btrfs_inode *inode_out = BTRFS_I(file_inode(file_out)); + u64 bs = inode_out->root->fs_info->sectorsize; u64 wb_len; int ret; if (!(remap_flags & REMAP_FILE_DEDUP)) { - struct btrfs_root *root_out = BTRFS_I(inode_out)->root; + struct btrfs_root *root_out = inode_out->root; if (btrfs_root_readonly(root_out)) return -EROFS; - ASSERT(inode_in->i_sb == inode_out->i_sb); + ASSERT(inode_in->vfs_inode.i_sb == inode_out->vfs_inode.i_sb); } /* Don't make the dst file partly checksummed */ - if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) { + if ((inode_in->flags & BTRFS_INODE_NODATASUM) != + (inode_out->flags & BTRFS_INODE_NODATASUM)) { return -EINVAL; } @@ -811,7 +814,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, * to complete so that new file extent items are in the fs tree. */ if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP)) - wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs); + wb_len = ALIGN(inode_in->vfs_inode.i_size, bs) - ALIGN_DOWN(pos_in, bs); else wb_len = ALIGN(*len, bs); @@ -832,16 +835,14 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, * Also we don't need to check ASYNC_EXTENT, as async extent will be * CoWed anyway, not affecting nocow part. */ - ret = filemap_flush(inode_in->i_mapping); + ret = filemap_flush(inode_in->vfs_inode.i_mapping); if (ret < 0) return ret; - ret = btrfs_wait_ordered_range(BTRFS_I(inode_in), ALIGN_DOWN(pos_in, bs), - wb_len); + ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), wb_len); if (ret < 0) return ret; - ret = btrfs_wait_ordered_range(BTRFS_I(inode_out), ALIGN_DOWN(pos_out, bs), - wb_len); + ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), wb_len); if (ret < 0) return ret; @@ -863,8 +864,8 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, struct file *dst_file, loff_t destoff, loff_t len, unsigned int remap_flags) { - struct inode *src_inode = file_inode(src_file); - struct inode *dst_inode = file_inode(dst_file); + struct btrfs_inode *src_inode = BTRFS_I(file_inode(src_file)); + struct btrfs_inode *dst_inode = BTRFS_I(file_inode(dst_file)); bool same_inode = dst_inode == src_inode; int ret; @@ -872,9 +873,9 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, return -EINVAL; if (same_inode) { - btrfs_inode_lock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); + btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP); } else { - lock_two_nondirectories(src_inode, dst_inode); + lock_two_nondirectories(&src_inode->vfs_inode, &dst_inode->vfs_inode); btrfs_double_mmap_lock(src_inode, dst_inode); } @@ -884,16 +885,18 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, goto out_unlock; if (remap_flags & REMAP_FILE_DEDUP) - ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff); + ret = btrfs_extent_same(&src_inode->vfs_inode, off, len, + &dst_inode->vfs_inode, destoff); else ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); out_unlock: if (same_inode) { - btrfs_inode_unlock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP); } else { btrfs_double_mmap_unlock(src_inode, dst_inode); - unlock_two_nondirectories(src_inode, dst_inode); + unlock_two_nondirectories(&src_inode->vfs_inode, + &dst_inode->vfs_inode); } /* diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index bf267bdfa8f8..e58151933844 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -90,10 +90,15 @@ * map address of tree root to tree */ struct mapping_node { - struct { - struct rb_node rb_node; - u64 bytenr; - }; /* Use rb_simle_node for search/insert */ + union { + /* Use rb_simple_node for search/insert */ + struct { + struct rb_node rb_node; + u64 bytenr; + }; + + struct rb_simple_node simple_node; + }; void *data; }; @@ -106,10 +111,15 @@ struct mapping_tree { * present a tree block to process */ struct tree_block { - struct { - struct rb_node rb_node; - u64 bytenr; - }; /* Use rb_simple_node for search/insert */ + union { + /* Use rb_simple_node for search/insert */ + struct { + struct rb_node rb_node; + u64 bytenr; + }; + + struct rb_simple_node simple_node; + }; u64 owner; struct btrfs_key key; u8 level; @@ -178,8 +188,9 @@ static void mark_block_processed(struct reloc_control *rc, in_range(node->bytenr, rc->block_group->start, rc->block_group->length)) { blocksize = rc->extent_root->fs_info->nodesize; - set_extent_bit(&rc->processed_blocks, node->bytenr, - node->bytenr + blocksize - 1, EXTENT_DIRTY, NULL); + btrfs_set_extent_bit(&rc->processed_blocks, node->bytenr, + node->bytenr + blocksize - 1, EXTENT_DIRTY, + NULL); } node->processed = 1; } @@ -195,8 +206,8 @@ static struct btrfs_backref_node *walk_up_backref( int idx = *index; while (!list_empty(&node->upper)) { - edge = list_entry(node->upper.next, - struct btrfs_backref_edge, list[LOWER]); + edge = list_first_entry(&node->upper, struct btrfs_backref_edge, + list[LOWER]); edges[idx++] = edge; node = edge->node[UPPER]; } @@ -222,8 +233,8 @@ static struct btrfs_backref_node *walk_down_backref( idx--; continue; } - edge = list_entry(edge->list[LOWER].next, - struct btrfs_backref_edge, list[LOWER]); + edge = list_first_entry(&edge->list[LOWER], struct btrfs_backref_edge, + list[LOWER]); edges[idx - 1] = edge; *index = idx; return edge->node[UPPER]; @@ -342,19 +353,13 @@ static bool handle_useless_nodes(struct reloc_control *rc, if (cur == node) ret = true; - /* The node is the lowest node */ - if (cur->lowest) { - list_del_init(&cur->lower); - cur->lowest = 0; - } - /* Cleanup the lower edges */ while (!list_empty(&cur->lower)) { struct btrfs_backref_edge *edge; struct btrfs_backref_node *lower; - edge = list_entry(cur->lower.next, - struct btrfs_backref_edge, list[UPPER]); + edge = list_first_entry(&cur->lower, struct btrfs_backref_edge, + list[UPPER]); list_del(&edge->list[UPPER]); list_del(&edge->list[LOWER]); lower = edge->node[LOWER]; @@ -373,7 +378,6 @@ static bool handle_useless_nodes(struct reloc_control *rc, * cache to avoid unnecessary backref lookup. */ if (cur->level > 0) { - list_add(&cur->list, &cache->detached); cur->detached = 1; } else { rb_erase(&cur->rb_node, &cache->rb_root); @@ -426,7 +430,6 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( goto out; } - node->lowest = 1; cur = node; /* Breadth-first search to build backref cache */ @@ -470,92 +473,6 @@ out: } /* - * helper to add backref node for the newly created snapshot. - * the backref node is created by cloning backref node that - * corresponds to root of source tree - */ -static int clone_backref_node(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - const struct btrfs_root *src, - struct btrfs_root *dest) -{ - struct btrfs_root *reloc_root = src->reloc_root; - struct btrfs_backref_cache *cache = &rc->backref_cache; - struct btrfs_backref_node *node = NULL; - struct btrfs_backref_node *new_node; - struct btrfs_backref_edge *edge; - struct btrfs_backref_edge *new_edge; - struct rb_node *rb_node; - - rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start); - if (rb_node) { - node = rb_entry(rb_node, struct btrfs_backref_node, rb_node); - if (node->detached) - node = NULL; - else - BUG_ON(node->new_bytenr != reloc_root->node->start); - } - - if (!node) { - rb_node = rb_simple_search(&cache->rb_root, - reloc_root->commit_root->start); - if (rb_node) { - node = rb_entry(rb_node, struct btrfs_backref_node, - rb_node); - BUG_ON(node->detached); - } - } - - if (!node) - return 0; - - new_node = btrfs_backref_alloc_node(cache, dest->node->start, - node->level); - if (!new_node) - return -ENOMEM; - - new_node->lowest = node->lowest; - new_node->checked = 1; - new_node->root = btrfs_grab_root(dest); - ASSERT(new_node->root); - - if (!node->lowest) { - list_for_each_entry(edge, &node->lower, list[UPPER]) { - new_edge = btrfs_backref_alloc_edge(cache); - if (!new_edge) - goto fail; - - btrfs_backref_link_edge(new_edge, edge->node[LOWER], - new_node, LINK_UPPER); - } - } else { - list_add_tail(&new_node->lower, &cache->leaves); - } - - rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr, - &new_node->rb_node); - if (rb_node) - btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST); - - if (!new_node->lowest) { - list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) { - list_add_tail(&new_edge->list[LOWER], - &new_edge->node[LOWER]->upper); - } - } - return 0; -fail: - while (!list_empty(&new_node->lower)) { - new_edge = list_entry(new_node->lower.next, - struct btrfs_backref_edge, list[UPPER]); - list_del(&new_edge->list[UPPER]); - btrfs_backref_free_edge(cache, new_edge); - } - btrfs_backref_free_node(cache, new_node); - return -ENOMEM; -} - -/* * helper to add 'address of tree root -> reloc tree' mapping */ static int __add_reloc_root(struct btrfs_root *root) @@ -573,8 +490,7 @@ static int __add_reloc_root(struct btrfs_root *root) node->data = root; spin_lock(&rc->reloc_root_tree.lock); - rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, - node->bytenr, &node->rb_node); + rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, &node->simple_node); spin_unlock(&rc->reloc_root_tree.lock); if (rb_node) { btrfs_err(fs_info, @@ -657,8 +573,7 @@ static int __update_reloc_root(struct btrfs_root *root) spin_lock(&rc->reloc_root_tree.lock); node->bytenr = root->node->start; - rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, - node->bytenr, &node->rb_node); + rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, &node->simple_node); spin_unlock(&rc->reloc_root_tree.lock); if (rb_node) btrfs_backref_panic(fs_info, node->bytenr, -EEXIST); @@ -950,7 +865,6 @@ int replace_file_extents(struct btrfs_trans_handle *trans, u32 i; int ret = 0; int first = 1; - int dirty = 0; if (rc->stage != UPDATE_DATA_PTRS) return 0; @@ -1005,16 +919,16 @@ int replace_file_extents(struct btrfs_trans_handle *trans, /* Take mmap lock to serialize with reflinks. */ if (!down_read_trylock(&inode->i_mmap_lock)) continue; - ret = try_lock_extent(&inode->io_tree, key.offset, - end, &cached_state); + ret = btrfs_try_lock_extent(&inode->io_tree, key.offset, + end, &cached_state); if (!ret) { up_read(&inode->i_mmap_lock); continue; } btrfs_drop_extent_map_range(inode, key.offset, end, true); - unlock_extent(&inode->io_tree, key.offset, end, - &cached_state); + btrfs_unlock_extent(&inode->io_tree, key.offset, end, + &cached_state); up_read(&inode->i_mmap_lock); } } @@ -1030,7 +944,6 @@ int replace_file_extents(struct btrfs_trans_handle *trans, } btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); - dirty = 1; key.offset -= btrfs_file_extent_offset(leaf, fi); ref.action = BTRFS_ADD_DELAYED_REF; @@ -1061,8 +974,6 @@ int replace_file_extents(struct btrfs_trans_handle *trans, break; } } - if (dirty) - btrfs_mark_buffer_dirty(trans, leaf); if (inode) btrfs_add_delayed_iput(inode); return ret; @@ -1255,13 +1166,11 @@ again: */ btrfs_set_node_blockptr(parent, slot, new_bytenr); btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); - btrfs_mark_buffer_dirty(trans, parent); btrfs_set_node_blockptr(path->nodes[level], path->slots[level], old_bytenr); btrfs_set_node_ptr_generation(path->nodes[level], path->slots[level], old_ptr_gen); - btrfs_mark_buffer_dirty(trans, path->nodes[level]); ref.action = BTRFS_ADD_DELAYED_REF; ref.bytenr = old_bytenr; @@ -1478,9 +1387,9 @@ static int invalidate_extent_cache(struct btrfs_root *root, } /* the lock_extent waits for read_folio to complete */ - lock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_lock_extent(&inode->io_tree, start, end, &cached_state); btrfs_drop_extent_map_range(inode, start, end, true); - unlock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); } return 0; } @@ -1615,7 +1524,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_root_level(root_item); - atomic_inc(&reloc_root->node->refs); + refcount_inc(&reloc_root->node->refs); path->nodes[level] = reloc_root->node; path->slots[level] = 0; } else { @@ -1797,8 +1706,8 @@ again: rc->merge_reloc_tree = true; while (!list_empty(&rc->reloc_roots)) { - reloc_root = list_entry(rc->reloc_roots.next, - struct btrfs_root, root_list); + reloc_root = list_first_entry(&rc->reloc_roots, + struct btrfs_root, root_list); list_del_init(&reloc_root->root_list); root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, @@ -1913,8 +1822,7 @@ again: while (!list_empty(&reloc_roots)) { found = 1; - reloc_root = list_entry(reloc_roots.next, - struct btrfs_root, root_list); + reloc_root = list_first_entry(&reloc_roots, struct btrfs_root, root_list); root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); @@ -2030,11 +1938,11 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, * reloc root without a corresponding root this could return ENOENT. */ if (IS_ERR(root)) { - ASSERT(0); + DEBUG_WARN("error %ld reading root for reloc root", PTR_ERR(root)); return PTR_ERR(root); } if (root->reloc_root != reloc_root) { - ASSERT(0); + DEBUG_WARN("unexpected reloc root found"); btrfs_err(fs_info, "root %llu has two reloc roots associated with it", reloc_root->root_key.offset); @@ -2058,100 +1966,72 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, int index = 0; int ret; - next = node; - while (1) { - cond_resched(); - next = walk_up_backref(next, edges, &index); - root = next->root; + next = walk_up_backref(node, edges, &index); + root = next->root; - /* - * If there is no root, then our references for this block are - * incomplete, as we should be able to walk all the way up to a - * block that is owned by a root. - * - * This path is only for SHAREABLE roots, so if we come upon a - * non-SHAREABLE root then we have backrefs that resolve - * improperly. - * - * Both of these cases indicate file system corruption, or a bug - * in the backref walking code. - */ - if (!root) { - ASSERT(0); - btrfs_err(trans->fs_info, - "bytenr %llu doesn't have a backref path ending in a root", - node->bytenr); - return ERR_PTR(-EUCLEAN); - } - if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { - ASSERT(0); - btrfs_err(trans->fs_info, - "bytenr %llu has multiple refs with one ending in a non-shareable root", - node->bytenr); - return ERR_PTR(-EUCLEAN); - } - - if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { - ret = record_reloc_root_in_trans(trans, root); - if (ret) - return ERR_PTR(ret); - break; - } + /* + * If there is no root, then our references for this block are + * incomplete, as we should be able to walk all the way up to a block + * that is owned by a root. + * + * This path is only for SHAREABLE roots, so if we come upon a + * non-SHAREABLE root then we have backrefs that resolve improperly. + * + * Both of these cases indicate file system corruption, or a bug in the + * backref walking code. + */ + if (unlikely(!root)) { + btrfs_err(trans->fs_info, + "bytenr %llu doesn't have a backref path ending in a root", + node->bytenr); + return ERR_PTR(-EUCLEAN); + } + if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) { + btrfs_err(trans->fs_info, + "bytenr %llu has multiple refs with one ending in a non-shareable root", + node->bytenr); + return ERR_PTR(-EUCLEAN); + } - ret = btrfs_record_root_in_trans(trans, root); + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { + ret = record_reloc_root_in_trans(trans, root); if (ret) return ERR_PTR(ret); - root = root->reloc_root; - - /* - * We could have raced with another thread which failed, so - * root->reloc_root may not be set, return ENOENT in this case. - */ - if (!root) - return ERR_PTR(-ENOENT); + goto found; + } - if (next->new_bytenr != root->node->start) { - /* - * We just created the reloc root, so we shouldn't have - * ->new_bytenr set and this shouldn't be in the changed - * list. If it is then we have multiple roots pointing - * at the same bytenr which indicates corruption, or - * we've made a mistake in the backref walking code. - */ - ASSERT(next->new_bytenr == 0); - ASSERT(list_empty(&next->list)); - if (next->new_bytenr || !list_empty(&next->list)) { - btrfs_err(trans->fs_info, - "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", - node->bytenr, next->bytenr); - return ERR_PTR(-EUCLEAN); - } + ret = btrfs_record_root_in_trans(trans, root); + if (ret) + return ERR_PTR(ret); + root = root->reloc_root; - next->new_bytenr = root->node->start; - btrfs_put_root(next->root); - next->root = btrfs_grab_root(root); - ASSERT(next->root); - list_add_tail(&next->list, - &rc->backref_cache.changed); - mark_block_processed(rc, next); - break; - } + /* + * We could have raced with another thread which failed, so + * root->reloc_root may not be set, return ENOENT in this case. + */ + if (!root) + return ERR_PTR(-ENOENT); - WARN_ON(1); - root = NULL; - next = walk_down_backref(edges, &index); - if (!next || next->level <= node->level) - break; - } - if (!root) { + if (next->new_bytenr) { /* - * This can happen if there's fs corruption or if there's a bug - * in the backref lookup code. + * We just created the reloc root, so we shouldn't have + * ->new_bytenr set yet. If it is then we have multiple roots + * pointing at the same bytenr which indicates corruption, or + * we've made a mistake in the backref walking code. */ - ASSERT(0); - return ERR_PTR(-ENOENT); + ASSERT(next->new_bytenr == 0); + btrfs_err(trans->fs_info, + "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", + node->bytenr, next->bytenr); + return ERR_PTR(-EUCLEAN); } + next->new_bytenr = root->node->start; + btrfs_put_root(next->root); + next->root = btrfs_grab_root(root); + ASSERT(next->root); + mark_block_processed(rc, next); +found: next = node; /* setup backref node path for btrfs_reloc_cow_block */ while (1) { @@ -2237,8 +2117,8 @@ static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc, if (list_empty(&next->upper)) break; - edge = list_entry(next->upper.next, - struct btrfs_backref_edge, list[LOWER]); + edge = list_first_entry(&next->upper, struct btrfs_backref_edge, + list[LOWER]); edges[index++] = edge; next = edge->node[UPPER]; } @@ -2247,17 +2127,11 @@ static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc, return num_bytes; } -static int reserve_metadata_space(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct btrfs_backref_node *node) +static int refill_metadata_space(struct btrfs_trans_handle *trans, + struct reloc_control *rc, u64 num_bytes) { - struct btrfs_root *root = rc->extent_root; - struct btrfs_fs_info *fs_info = root->fs_info; - u64 num_bytes; + struct btrfs_fs_info *fs_info = trans->fs_info; int ret; - u64 tmp; - - num_bytes = calcu_metadata_size(rc, node) * 2; trans->block_rsv = rc->block_rsv; rc->reserved_bytes += num_bytes; @@ -2270,7 +2144,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes, BTRFS_RESERVE_FLUSH_LIMIT); if (ret) { - tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; + u64 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; + while (tmp <= rc->reserved_bytes) tmp <<= 1; /* @@ -2288,6 +2163,16 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, return 0; } +static int reserve_metadata_space(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_backref_node *node) +{ + u64 num_bytes; + + num_bytes = calcu_metadata_size(rc, node) * 2; + return refill_metadata_space(trans, rc, num_bytes); +} + /* * relocate a block tree, and then update pointers in upper level * blocks that reference the block to point to the new location. @@ -2442,7 +2327,7 @@ next: if (!ret && node->pending) { btrfs_backref_drop_node_buffer(node); - list_move_tail(&node->list, &rc->backref_cache.changed); + list_del_init(&node->list); node->pending = 0; } @@ -2479,8 +2364,8 @@ static int finish_pending_nodes(struct btrfs_trans_handle *trans, for (level = 0; level < BTRFS_MAX_LEVEL; level++) { while (!list_empty(&cache->pending[level])) { - node = list_entry(cache->pending[level].next, - struct btrfs_backref_node, list); + node = list_first_entry(&cache->pending[level], + struct btrfs_backref_node, list); list_move_tail(&node->list, &list); BUG_ON(!node->pending); @@ -2518,8 +2403,8 @@ static void update_processed_blocks(struct reloc_control *rc, if (list_empty(&next->upper)) break; - edge = list_entry(next->upper.next, - struct btrfs_backref_edge, list[LOWER]); + edge = list_first_entry(&next->upper, struct btrfs_backref_edge, + list[LOWER]); edges[index++] = edge; next = edge->node[UPPER]; } @@ -2531,8 +2416,8 @@ static int tree_block_processed(u64 bytenr, struct reloc_control *rc) { u32 blocksize = rc->extent_root->fs_info->nodesize; - if (test_range_bit(&rc->processed_blocks, bytenr, - bytenr + blocksize - 1, EXTENT_DIRTY, NULL)) + if (btrfs_test_range_bit(&rc->processed_blocks, bytenr, + bytenr + blocksize - 1, EXTENT_DIRTY, NULL)) return 1; return 0; } @@ -2605,8 +2490,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, /* * This block was the root block of a root, and this is * the first time we're processing the block and thus it - * should not have had the ->new_bytenr modified and - * should have not been included on the changed list. + * should not have had the ->new_bytenr modified. * * However in the case of corruption we could have * multiple refs pointing to the same block improperly, @@ -2616,8 +2500,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, * normal user in the case of corruption. */ ASSERT(node->new_bytenr == 0); - ASSERT(list_empty(&node->list)); - if (node->new_bytenr || !list_empty(&node->list)) { + if (node->new_bytenr) { btrfs_err(root->fs_info, "bytenr %llu has improper references to it", node->bytenr); @@ -2640,17 +2523,12 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, btrfs_put_root(node->root); node->root = btrfs_grab_root(root); ASSERT(node->root); - list_add_tail(&node->list, &rc->backref_cache.changed); } else { - path->lowest_level = node->level; - if (root == root->fs_info->chunk_root) - btrfs_reserve_chunk_metadata(trans, false); - ret = btrfs_search_slot(trans, root, key, path, 0, 1); - btrfs_release_path(path); - if (root == root->fs_info->chunk_root) - btrfs_trans_release_chunk_metadata(trans); - if (ret > 0) - ret = 0; + btrfs_err(root->fs_info, + "bytenr %llu resolved to a non-shareable root", + node->bytenr); + ret = -EUCLEAN; + goto out; } if (!ret) update_processed_blocks(rc, node); @@ -2658,11 +2536,50 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, ret = do_relocation(trans, rc, node, key, path, 1); } out: - if (ret || node->level == 0 || node->cowonly) + if (ret || node->level == 0) btrfs_backref_cleanup_node(&rc->backref_cache, node); return ret; } +static int relocate_cowonly_block(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct tree_block *block, + struct btrfs_path *path) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root; + u64 num_bytes; + int nr_levels; + int ret; + + root = btrfs_get_fs_root(fs_info, block->owner, true); + if (IS_ERR(root)) + return PTR_ERR(root); + + nr_levels = max(btrfs_header_level(root->node) - block->level, 0) + 1; + + num_bytes = fs_info->nodesize * nr_levels; + ret = refill_metadata_space(trans, rc, num_bytes); + if (ret) { + btrfs_put_root(root); + return ret; + } + path->lowest_level = block->level; + if (root == root->fs_info->chunk_root) + btrfs_reserve_chunk_metadata(trans, false); + + ret = btrfs_search_slot(trans, root, &block->key, path, 0, 1); + path->lowest_level = 0; + btrfs_release_path(path); + + if (root == root->fs_info->chunk_root) + btrfs_trans_release_chunk_metadata(trans); + if (ret > 0) + ret = 0; + btrfs_put_root(root); + + return ret; +} + /* * relocate a list of blocks */ @@ -2702,6 +2619,20 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, /* Do tree relocation */ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { + /* + * For COWonly blocks, or the data reloc tree, we only need to + * COW down to the block, there's no need to generate a backref + * tree. + */ + if (block->owner && + (!btrfs_is_fstree(block->owner) || + block->owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) { + ret = relocate_cowonly_block(trans, rc, block, path); + if (ret) + break; + continue; + } + node = build_backref_tree(trans, rc, &block->key, block->level, block->bytenr); if (IS_ERR(node)) { @@ -2735,69 +2666,24 @@ static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control u64 num_bytes; int nr; int ret = 0; - u64 i_size = i_size_read(&inode->vfs_inode); u64 prealloc_start = cluster->start - offset; u64 prealloc_end = cluster->end - offset; u64 cur_offset = prealloc_start; /* - * For subpage case, previous i_size may not be aligned to PAGE_SIZE. - * This means the range [i_size, PAGE_END + 1) is filled with zeros by - * btrfs_do_readpage() call of previously relocated file cluster. + * For blocksize < folio size case (either bs < page size or large folios), + * beyond i_size, all blocks are filled with zero. * - * If the current cluster starts in the above range, btrfs_do_readpage() + * If the current cluster covers the above range, btrfs_do_readpage() * will skip the read, and relocate_one_folio() will later writeback * the padding zeros as new data, causing data corruption. * - * Here we have to manually invalidate the range (i_size, PAGE_END + 1). + * Here we have to invalidate the cache covering our cluster. */ - if (!PAGE_ALIGNED(i_size)) { - struct address_space *mapping = inode->vfs_inode.i_mapping; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - const u32 sectorsize = fs_info->sectorsize; - struct folio *folio; - - ASSERT(sectorsize < PAGE_SIZE); - ASSERT(IS_ALIGNED(i_size, sectorsize)); - - /* - * Subpage can't handle page with DIRTY but without UPTODATE - * bit as it can lead to the following deadlock: - * - * btrfs_read_folio() - * | Page already *locked* - * |- btrfs_lock_and_flush_ordered_range() - * |- btrfs_start_ordered_extent() - * |- extent_write_cache_pages() - * |- lock_page() - * We try to lock the page we already hold. - * - * Here we just writeback the whole data reloc inode, so that - * we will be ensured to have no dirty range in the page, and - * are safe to clear the uptodate bits. - * - * This shouldn't cause too much overhead, as we need to write - * the data back anyway. - */ - ret = filemap_write_and_wait(mapping); - if (ret < 0) - return ret; - - clear_extent_bits(&inode->io_tree, i_size, - round_up(i_size, PAGE_SIZE) - 1, - EXTENT_UPTODATE); - folio = filemap_lock_folio(mapping, i_size >> PAGE_SHIFT); - /* - * If page is freed we don't need to do anything then, as we - * will re-read the whole page anyway. - */ - if (!IS_ERR(folio)) { - btrfs_subpage_clear_uptodate(fs_info, folio, i_size, - round_up(i_size, PAGE_SIZE) - i_size); - folio_unlock(folio); - folio_put(folio); - } - } + ret = filemap_invalidate_inode(&inode->vfs_inode, true, prealloc_start, + prealloc_end); + if (ret < 0) + return ret; BUG_ON(cluster->start != cluster->boundary[0]); ret = btrfs_alloc_data_chunk_ondemand(inode, @@ -2815,21 +2701,21 @@ static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control else end = cluster->end - offset; - lock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_lock_extent(&inode->io_tree, start, end, &cached_state); num_bytes = end + 1 - start; ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); cur_offset = end + 1; - unlock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); if (ret) break; } btrfs_inode_unlock(inode, 0); if (cur_offset < prealloc_end) - btrfs_free_reserved_data_space_noquota(inode->root->fs_info, - prealloc_end + 1 - cur_offset); + btrfs_free_reserved_data_space_noquota(inode, + prealloc_end + 1 - cur_offset); return ret; } @@ -2843,7 +2729,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_contr u64 end = rc->cluster.end - offset; int ret = 0; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) return -ENOMEM; @@ -2854,10 +2740,10 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_contr em->ram_bytes = em->len; em->flags |= EXTENT_FLAG_PINNED; - lock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_lock_extent(&inode->io_tree, start, end, &cached_state); ret = btrfs_replace_extent_map_range(inode, em, false); - unlock_extent(&inode->io_tree, start, end, &cached_state); - free_extent_map(em); + btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_free_extent_map(em); return ret; } @@ -2886,13 +2772,15 @@ static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster, static int relocate_one_folio(struct reloc_control *rc, struct file_ra_state *ra, - int *cluster_nr, unsigned long index) + int *cluster_nr, u64 *file_offset_ret) { const struct file_extent_cluster *cluster = &rc->cluster; struct inode *inode = rc->data_inode; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + const u64 orig_file_offset = *file_offset_ret; u64 offset = BTRFS_I(inode)->reloc_block_group_start; - const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT; + const pgoff_t last_index = (cluster->end - offset) >> PAGE_SHIFT; + const pgoff_t index = orig_file_offset >> PAGE_SHIFT; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); struct folio *folio; u64 folio_start; @@ -2902,6 +2790,7 @@ static int relocate_one_folio(struct reloc_control *rc, const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags); ASSERT(index <= last_index); +again: folio = filemap_lock_folio(inode->i_mapping, index); if (IS_ERR(folio)) { @@ -2924,8 +2813,6 @@ static int relocate_one_folio(struct reloc_control *rc, return PTR_ERR(folio); } - WARN_ON(folio_order(folio)); - if (folio_test_readahead(folio) && !use_rst) page_cache_async_readahead(inode->i_mapping, ra, NULL, folio, last_index + 1 - index); @@ -2937,11 +2824,16 @@ static int relocate_one_folio(struct reloc_control *rc, ret = -EIO; goto release_folio; } + if (folio->mapping != inode->i_mapping) { + folio_unlock(folio); + folio_put(folio); + goto again; + } } /* * We could have lost folio private when we dropped the lock to read the - * folio above, make sure we set_page_extent_mapped here so we have any + * folio above, make sure we set_folio_extent_mapped() here so we have any * of the subpage blocksize stuff we need in place. */ ret = set_folio_extent_mapped(folio); @@ -2949,7 +2841,7 @@ static int relocate_one_folio(struct reloc_control *rc, goto release_folio; folio_start = folio_pos(folio); - folio_end = folio_start + PAGE_SIZE - 1; + folio_end = folio_start + folio_size(folio) - 1; /* * Start from the cluster, as for subpage case, the cluster can start @@ -2973,15 +2865,15 @@ static int relocate_one_folio(struct reloc_control *rc, goto release_folio; /* Mark the range delalloc and dirty for later writeback */ - lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, - &cached_state); + btrfs_lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, + clamped_end, &cached_state); ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start, clamped_end, 0, &cached_state); if (ret) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, - clamped_start, clamped_end, - EXTENT_LOCKED | EXTENT_BOUNDARY, - &cached_state); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, + clamped_start, clamped_end, + EXTENT_LOCKED | EXTENT_BOUNDARY, + &cached_state); btrfs_delalloc_release_metadata(BTRFS_I(inode), clamped_len, true); btrfs_delalloc_release_extents(BTRFS_I(inode), @@ -2997,18 +2889,19 @@ static int relocate_one_folio(struct reloc_control *rc, * EXTENT_BOUNDARY bit prevents current extent from being merged * with previous extent. */ - if (in_range(cluster->boundary[*cluster_nr] - offset, folio_start, PAGE_SIZE)) { + if (in_range(cluster->boundary[*cluster_nr] - offset, + folio_start, folio_size(folio))) { u64 boundary_start = cluster->boundary[*cluster_nr] - offset; u64 boundary_end = boundary_start + fs_info->sectorsize - 1; - set_extent_bit(&BTRFS_I(inode)->io_tree, - boundary_start, boundary_end, - EXTENT_BOUNDARY, NULL); + btrfs_set_extent_bit(&BTRFS_I(inode)->io_tree, + boundary_start, boundary_end, + EXTENT_BOUNDARY, NULL); } - unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, - &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, + &cached_state); btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); cur += clamped_len; @@ -3027,6 +2920,7 @@ static int relocate_one_folio(struct reloc_control *rc, btrfs_throttle(fs_info); if (btrfs_should_cancel_balance(fs_info)) ret = -ECANCELED; + *file_offset_ret = folio_end + 1; return ret; release_folio: @@ -3040,8 +2934,7 @@ static int relocate_file_extent_cluster(struct reloc_control *rc) struct inode *inode = rc->data_inode; const struct file_extent_cluster *cluster = &rc->cluster; u64 offset = BTRFS_I(inode)->reloc_block_group_start; - unsigned long index; - unsigned long last_index; + u64 cur_file_offset = cluster->start - offset; struct file_ra_state *ra; int cluster_nr = 0; int ret = 0; @@ -3063,10 +2956,11 @@ static int relocate_file_extent_cluster(struct reloc_control *rc) if (ret) goto out; - last_index = (cluster->end - offset) >> PAGE_SHIFT; - for (index = (cluster->start - offset) >> PAGE_SHIFT; - index <= last_index && !ret; index++) - ret = relocate_one_folio(rc, ra, &cluster_nr, index); + while (cur_file_offset < cluster->end - offset) { + ret = relocate_one_folio(rc, ra, &cluster_nr, &cur_file_offset); + if (ret) + break; + } if (ret == 0) WARN_ON(cluster_nr != cluster->nr); out: @@ -3229,7 +3123,7 @@ static int add_tree_block(struct reloc_control *rc, block->key_ready = false; block->owner = owner; - rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node); + rb_node = rb_simple_insert(blocks, &block->simple_node); if (rb_node) btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr, -EEXIST); @@ -3310,21 +3204,23 @@ out: return ret; } -static int delete_block_group_cache(struct btrfs_fs_info *fs_info, - struct btrfs_block_group *block_group, +static int delete_block_group_cache(struct btrfs_block_group *block_group, struct inode *inode, u64 ino) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *root = fs_info->tree_root; struct btrfs_trans_handle *trans; + struct btrfs_inode *btrfs_inode; int ret = 0; if (inode) goto truncate; - inode = btrfs_iget(ino, root); - if (IS_ERR(inode)) + btrfs_inode = btrfs_iget(ino, root); + if (IS_ERR(btrfs_inode)) return -ENOENT; + inode = &btrfs_inode->vfs_inode; truncate: ret = btrfs_check_trunc_cache_free_space(fs_info, @@ -3384,8 +3280,7 @@ static int delete_v1_space_cache(struct extent_buffer *leaf, } if (!found) return -ENOENT; - ret = delete_block_group_cache(leaf->fs_info, block_group, NULL, - space_cache_ino); + ret = delete_block_group_cache(block_group, NULL, space_cache_ino); return ret; } @@ -3505,9 +3400,9 @@ next: goto next; } - block_found = find_first_extent_bit(&rc->processed_blocks, - key.objectid, &start, &end, - EXTENT_DIRTY, NULL); + block_found = btrfs_find_first_extent_bit(&rc->processed_blocks, + key.objectid, &start, &end, + EXTENT_DIRTY, NULL); if (block_found && start <= key.objectid) { btrfs_release_path(path); @@ -3716,7 +3611,7 @@ restart: } btrfs_release_path(path); - clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); + btrfs_clear_extent_bit(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, NULL); if (trans) { btrfs_end_transaction_throttle(trans); @@ -3793,7 +3688,6 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC); - btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; @@ -3833,10 +3727,10 @@ out: * the inode is in data relocation tree and its link count is 0 */ static noinline_for_stack struct inode *create_reloc_inode( - struct btrfs_fs_info *fs_info, const struct btrfs_block_group *group) { - struct inode *inode = NULL; + struct btrfs_fs_info *fs_info = group->fs_info; + struct btrfs_inode *inode = NULL; struct btrfs_trans_handle *trans; struct btrfs_root *root; u64 objectid; @@ -3864,18 +3758,19 @@ static noinline_for_stack struct inode *create_reloc_inode( inode = NULL; goto out; } - BTRFS_I(inode)->reloc_block_group_start = group->start; + inode->reloc_block_group_start = group->start; - ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + ret = btrfs_orphan_add(trans, inode); out: btrfs_put_root(root); btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); if (ret) { - iput(inode); - inode = ERR_PTR(ret); + if (inode) + iput(&inode->vfs_inode); + return ERR_PTR(ret); } - return inode; + return &inode->vfs_inode; } /* @@ -3932,7 +3827,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) btrfs_backref_init_cache(fs_info, &rc->backref_cache, true); rc->reloc_root_tree.rb_root = RB_ROOT; spin_lock_init(&rc->reloc_root_tree.lock); - extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); + btrfs_extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); return rc; } @@ -3953,7 +3848,7 @@ static void free_reloc_control(struct reloc_control *rc) */ static void describe_relocation(struct btrfs_block_group *block_group) { - char buf[128] = {'\0'}; + char buf[128] = "NONE"; btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); @@ -3973,7 +3868,8 @@ static const char *stage_to_string(enum reloc_stage stage) /* * function to relocate all extents in a block group. */ -int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) +int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, + bool verbose) { struct btrfs_block_group *bg; struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start); @@ -4049,7 +3945,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) btrfs_free_path(path); if (!IS_ERR(inode)) - ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0); + ret = delete_block_group_cache(rc->block_group, inode, 0); else ret = PTR_ERR(inode); @@ -4058,14 +3954,15 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) goto out; } - rc->data_inode = create_reloc_inode(fs_info, rc->block_group); + rc->data_inode = create_reloc_inode(rc->block_group); if (IS_ERR(rc->data_inode)) { err = PTR_ERR(rc->data_inode); rc->data_inode = NULL; goto out; } - describe_relocation(rc->block_group); + if (verbose) + describe_relocation(rc->block_group); btrfs_wait_block_group_reservations(rc->block_group); btrfs_wait_nocow_writers(rc->block_group); @@ -4109,8 +4006,10 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) if (rc->extents_found == 0) break; - btrfs_info(fs_info, "found %llu extents, stage: %s", - rc->extents_found, stage_to_string(finishes_stage)); + if (verbose) + btrfs_info(fs_info, "found %llu extents, stage: %s", + rc->extents_found, + stage_to_string(finishes_stage)); } WARN_ON(rc->block_group->pinned > 0); @@ -4255,8 +4154,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) rc->merge_reloc_tree = true; while (!list_empty(&reloc_roots)) { - reloc_root = list_entry(reloc_roots.next, - struct btrfs_root, root_list); + reloc_root = list_first_entry(&reloc_roots, struct btrfs_root, root_list); list_del(&reloc_root->root_list); if (btrfs_root_refs(&reloc_root->root_item) == 0) { @@ -4349,7 +4247,7 @@ int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered) while (!list_empty(&list)) { struct btrfs_ordered_sum *sums = - list_entry(list.next, struct btrfs_ordered_sum, list); + list_first_entry(&list, struct btrfs_ordered_sum, list); list_del_init(&sums->list); @@ -4399,11 +4297,21 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, WARN_ON(!first_cow && level == 0); node = rc->backref_cache.path[level]; - BUG_ON(node->bytenr != buf->start && - node->new_bytenr != buf->start); + + /* + * If node->bytenr != buf->start and node->new_bytenr != + * buf->start then we've got the wrong backref node for what we + * expected to see here and the cache is incorrect. + */ + if (unlikely(node->bytenr != buf->start && node->new_bytenr != buf->start)) { + btrfs_err(fs_info, +"bytenr %llu was found but our backref cache was expecting %llu or %llu", + buf->start, node->bytenr, node->new_bytenr); + return -EUCLEAN; + } btrfs_backref_drop_node_buffer(node); - atomic_inc(&cow->refs); + refcount_inc(&cow->refs); node->eb = cow; node->new_bytenr = cow->start; @@ -4500,10 +4408,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, return ret; } new_root->reloc_root = btrfs_grab_root(reloc_root); - - if (rc->create_reloc_tree) - ret = clone_backref_node(trans, rc, root, reloc_root); - return ret; + return 0; } /* diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h index 788c86d8633a..5c36b3f84b57 100644 --- a/fs/btrfs/relocation.h +++ b/fs/btrfs/relocation.h @@ -12,7 +12,8 @@ struct btrfs_trans_handle; struct btrfs_ordered_extent; struct btrfs_pending_snapshot; -int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start); +int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, + bool verbose); int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 33962671a96c..e22e6b06927a 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -197,7 +197,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_set_root_generation_v2(item, btrfs_root_generation(item)); write_extent_buffer(l, item, ptr, sizeof(*item)); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); out: btrfs_free_path(path); return ret; @@ -447,7 +446,6 @@ again: btrfs_set_root_ref_name_len(leaf, ref, name->len); ptr = (unsigned long)(ref + 1); write_extent_buffer(leaf, name->name, ptr, name->len); - btrfs_mark_buffer_dirty(trans, leaf); if (key.type == BTRFS_ROOT_BACKREF_KEY) { btrfs_release_path(path); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 204c928beaf9..6776e6ab8d10 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -66,8 +66,6 @@ struct scrub_ctx; /* Represent one sector and its needed info to verify the content. */ struct scrub_sector_verification { - bool is_metadata; - union { /* * Csum pointer for data csum verification. Should point to a @@ -100,6 +98,38 @@ enum scrub_stripe_flags { SCRUB_STRIPE_FLAG_NO_REPORT, }; +/* + * We have multiple bitmaps for one scrub_stripe. + * However each bitmap has at most (BTRFS_STRIPE_LEN / blocksize) bits, + * which is normally 16, and much smaller than BITS_PER_LONG (32 or 64). + * + * So to reduce memory usage for each scrub_stripe, we pack those bitmaps + * into a larger one. + * + * These enum records where the sub-bitmap are inside the larger one. + * Each subbitmap starts at scrub_bitmap_nr_##name * nr_sectors bit. + */ +enum { + /* Which blocks are covered by extent items. */ + scrub_bitmap_nr_has_extent = 0, + + /* Which blocks are meteadata. */ + scrub_bitmap_nr_is_metadata, + + /* + * Which blocks have errors, including IO, csum, and metadata + * errors. + * This sub-bitmap is the OR results of the next few error related + * sub-bitmaps. + */ + scrub_bitmap_nr_error, + scrub_bitmap_nr_io_error, + scrub_bitmap_nr_csum_error, + scrub_bitmap_nr_meta_error, + scrub_bitmap_nr_meta_gen_error, + scrub_bitmap_nr_last, +}; + #define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE) /* @@ -138,36 +168,15 @@ struct scrub_stripe { */ unsigned long state; - /* Indicate which sectors are covered by extent items. */ - unsigned long extent_sector_bitmap; + /* The large bitmap contains all the sub-bitmaps. */ + unsigned long bitmaps[BITS_TO_LONGS(scrub_bitmap_nr_last * + (BTRFS_STRIPE_LEN / BTRFS_MIN_BLOCKSIZE))]; /* - * The errors hit during the initial read of the stripe. - * - * Would be utilized for error reporting and repair. - * - * The remaining init_nr_* records the number of errors hit, only used - * by error reporting. + * For writeback (repair or replace) error reporting. + * This one is protected by a spinlock, thus can not be packed into + * the larger bitmap. */ - unsigned long init_error_bitmap; - unsigned int init_nr_io_errors; - unsigned int init_nr_csum_errors; - unsigned int init_nr_meta_errors; - - /* - * The following error bitmaps are all for the current status. - * Every time we submit a new read, these bitmaps may be updated. - * - * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap; - * - * IO and csum errors can happen for both metadata and data. - */ - unsigned long error_bitmap; - unsigned long io_error_bitmap; - unsigned long csum_error_bitmap; - unsigned long meta_error_bitmap; - - /* For writeback (repair or replace) error reporting. */ unsigned long write_error_bitmap; /* Writeback can be concurrent, thus we need to protect the bitmap. */ @@ -219,6 +228,90 @@ struct scrub_ctx { refcount_t refs; }; +#define scrub_calc_start_bit(stripe, name, block_nr) \ +({ \ + unsigned int __start_bit; \ + \ + ASSERT(block_nr < stripe->nr_sectors, \ + "nr_sectors=%u block_nr=%u", stripe->nr_sectors, block_nr); \ + __start_bit = scrub_bitmap_nr_##name * stripe->nr_sectors + block_nr; \ + __start_bit; \ +}) + +#define IMPLEMENT_SCRUB_BITMAP_OPS(name) \ +static inline void scrub_bitmap_set_##name(struct scrub_stripe *stripe, \ + unsigned int block_nr, \ + unsigned int nr_blocks) \ +{ \ + const unsigned int start_bit = scrub_calc_start_bit(stripe, \ + name, block_nr); \ + \ + bitmap_set(stripe->bitmaps, start_bit, nr_blocks); \ +} \ +static inline void scrub_bitmap_clear_##name(struct scrub_stripe *stripe, \ + unsigned int block_nr, \ + unsigned int nr_blocks) \ +{ \ + const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \ + block_nr); \ + \ + bitmap_clear(stripe->bitmaps, start_bit, nr_blocks); \ +} \ +static inline bool scrub_bitmap_test_bit_##name(struct scrub_stripe *stripe, \ + unsigned int block_nr) \ +{ \ + const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \ + block_nr); \ + \ + return test_bit(start_bit, stripe->bitmaps); \ +} \ +static inline void scrub_bitmap_set_bit_##name(struct scrub_stripe *stripe, \ + unsigned int block_nr) \ +{ \ + const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \ + block_nr); \ + \ + set_bit(start_bit, stripe->bitmaps); \ +} \ +static inline void scrub_bitmap_clear_bit_##name(struct scrub_stripe *stripe, \ + unsigned int block_nr) \ +{ \ + const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \ + block_nr); \ + \ + clear_bit(start_bit, stripe->bitmaps); \ +} \ +static inline unsigned long scrub_bitmap_read_##name(struct scrub_stripe *stripe) \ +{ \ + const unsigned int nr_blocks = stripe->nr_sectors; \ + \ + ASSERT(nr_blocks > 0 && nr_blocks <= BITS_PER_LONG, \ + "nr_blocks=%u BITS_PER_LONG=%u", \ + nr_blocks, BITS_PER_LONG); \ + \ + return bitmap_read(stripe->bitmaps, nr_blocks * scrub_bitmap_nr_##name, \ + stripe->nr_sectors); \ +} \ +static inline bool scrub_bitmap_empty_##name(struct scrub_stripe *stripe) \ +{ \ + unsigned long bitmap = scrub_bitmap_read_##name(stripe); \ + \ + return bitmap_empty(&bitmap, stripe->nr_sectors); \ +} \ +static inline unsigned int scrub_bitmap_weight_##name(struct scrub_stripe *stripe) \ +{ \ + unsigned long bitmap = scrub_bitmap_read_##name(stripe); \ + \ + return bitmap_weight(&bitmap, stripe->nr_sectors); \ +} +IMPLEMENT_SCRUB_BITMAP_OPS(has_extent); +IMPLEMENT_SCRUB_BITMAP_OPS(is_metadata); +IMPLEMENT_SCRUB_BITMAP_OPS(error); +IMPLEMENT_SCRUB_BITMAP_OPS(io_error); +IMPLEMENT_SCRUB_BITMAP_OPS(csum_error); +IMPLEMENT_SCRUB_BITMAP_OPS(meta_error); +IMPLEMENT_SCRUB_BITMAP_OPS(meta_gen_error); + struct scrub_warning { struct btrfs_path *path; u64 extent_item_size; @@ -228,6 +321,19 @@ struct scrub_warning { struct btrfs_device *dev; }; +struct scrub_error_records { + /* + * Bitmap recording which blocks hit errors (IO/csum/...) during the + * initial read. + */ + unsigned long init_error_bitmap; + + unsigned int nr_io_errors; + unsigned int nr_csum_errors; + unsigned int nr_meta_errors; + unsigned int nr_meta_gen_errors; +}; + static void release_scrub_stripe(struct scrub_stripe *stripe) { if (!stripe) @@ -450,8 +556,8 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, * hold all of the paths here */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) - btrfs_warn_in_rcu(fs_info, -"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)", + btrfs_warn(fs_info, +"scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu length %u links %u (path: %s)", swarn->errstr, swarn->logical, btrfs_dev_name(swarn->dev), swarn->physical, @@ -464,8 +570,8 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, return 0; err: - btrfs_warn_in_rcu(fs_info, - "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d", + btrfs_warn(fs_info, + "scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu: path resolving failed with ret=%d", swarn->errstr, swarn->logical, btrfs_dev_name(swarn->dev), swarn->physical, @@ -490,7 +596,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * /* Super block error, no need to search extent tree. */ if (is_super) { - btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu", + btrfs_warn(fs_info, "scrub: %s on device %s, physical %llu", errstr, btrfs_dev_name(dev), physical); return; } @@ -525,14 +631,14 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * &ref_level); if (ret < 0) { btrfs_warn(fs_info, - "failed to resolve tree backref for logical %llu: %d", - swarn.logical, ret); + "scrub: failed to resolve tree backref for logical %llu: %d", + swarn.logical, ret); break; } if (ret > 0) break; - btrfs_warn_in_rcu(fs_info, -"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", + btrfs_warn(fs_info, +"scrub: %s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", errstr, swarn.logical, btrfs_dev_name(dev), swarn.physical, (ref_level ? "node" : "leaf"), ref_level, ref_root); @@ -579,20 +685,15 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) return ret; } -static struct page *scrub_stripe_get_page(struct scrub_stripe *stripe, int sector_nr) -{ - struct btrfs_fs_info *fs_info = stripe->bg->fs_info; - int page_index = (sector_nr << fs_info->sectorsize_bits) >> PAGE_SHIFT; - - return stripe->pages[page_index]; -} - -static unsigned int scrub_stripe_get_page_offset(struct scrub_stripe *stripe, - int sector_nr) +static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr) { - struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + u32 offset = (sector_nr << stripe->bg->fs_info->sectorsize_bits); + const struct page *page = stripe->pages[offset >> PAGE_SHIFT]; - return offset_in_page(sector_nr << fs_info->sectorsize_bits); + /* stripe->pages[] is allocated by us and no highmem is allowed. */ + ASSERT(page); + ASSERT(!PageHighMem(page)); + return page_address(page) + offset_in_page(offset); } static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr) @@ -600,46 +701,44 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr struct btrfs_fs_info *fs_info = stripe->bg->fs_info; const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits); - const struct page *first_page = scrub_stripe_get_page(stripe, sector_nr); - const unsigned int first_off = scrub_stripe_get_page_offset(stripe, sector_nr); + void *first_kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); + struct btrfs_header *header = first_kaddr; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 on_disk_csum[BTRFS_CSUM_SIZE]; u8 calculated_csum[BTRFS_CSUM_SIZE]; - struct btrfs_header *header; /* * Here we don't have a good way to attach the pages (and subpages) * to a dummy extent buffer, thus we have to directly grab the members * from pages. */ - header = (struct btrfs_header *)(page_address(first_page) + first_off); memcpy(on_disk_csum, header->csum, fs_info->csum_size); if (logical != btrfs_stack_header_bytenr(header)) { - bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree); - bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad bytenr, has %llu want %llu", + "scrub: tree block %llu mirror %u has bad bytenr, has %llu want %llu", logical, stripe->mirror_num, btrfs_stack_header_bytenr(header), logical); return; } if (memcmp(header->fsid, fs_info->fs_devices->metadata_uuid, BTRFS_FSID_SIZE) != 0) { - bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); - bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad fsid, has %pU want %pU", + "scrub: tree block %llu mirror %u has bad fsid, has %pU want %pU", logical, stripe->mirror_num, header->fsid, fs_info->fs_devices->fsid); return; } if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE) != 0) { - bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); - bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", + "scrub: tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", logical, stripe->mirror_num, header->chunk_tree_uuid, fs_info->chunk_tree_uuid); return; @@ -648,23 +747,20 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr /* Now check tree block csum. */ shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); - crypto_shash_update(shash, page_address(first_page) + first_off + - BTRFS_CSUM_SIZE, fs_info->sectorsize - BTRFS_CSUM_SIZE); + crypto_shash_update(shash, first_kaddr + BTRFS_CSUM_SIZE, + fs_info->sectorsize - BTRFS_CSUM_SIZE); for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) { - struct page *page = scrub_stripe_get_page(stripe, i); - unsigned int page_off = scrub_stripe_get_page_offset(stripe, i); - - crypto_shash_update(shash, page_address(page) + page_off, + crypto_shash_update(shash, scrub_stripe_get_kaddr(stripe, i), fs_info->sectorsize); } crypto_shash_final(shash, calculated_csum); if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) { - bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); - bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, +"scrub: tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, logical, stripe->mirror_num, CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); @@ -672,18 +768,19 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr } if (stripe->sectors[sector_nr].generation != btrfs_stack_header_generation(header)) { - bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); - bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + scrub_bitmap_set_meta_gen_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad generation, has %llu want %llu", + "scrub: tree block %llu mirror %u has bad generation, has %llu want %llu", logical, stripe->mirror_num, btrfs_stack_header_generation(header), stripe->sectors[sector_nr].generation); return; } - bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree); - bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree); - bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); + scrub_bitmap_clear_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_clear_csum_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_clear_meta_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_clear_meta_gen_error(stripe, sector_nr, sectors_per_tree); } static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) @@ -691,23 +788,22 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct scrub_sector_verification *sector = &stripe->sectors[sector_nr]; const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; - struct page *page = scrub_stripe_get_page(stripe, sector_nr); - unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr); + void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); u8 csum_buf[BTRFS_CSUM_SIZE]; int ret; ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors); /* Sector not utilized, skip it. */ - if (!test_bit(sector_nr, &stripe->extent_sector_bitmap)) + if (!scrub_bitmap_test_bit_has_extent(stripe, sector_nr)) return; /* IO error, no need to check. */ - if (test_bit(sector_nr, &stripe->io_error_bitmap)) + if (scrub_bitmap_test_bit_io_error(stripe, sector_nr)) return; /* Metadata, verify the full tree block. */ - if (sector->is_metadata) { + if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) { /* * Check if the tree block crosses the stripe boundary. If * crossed the boundary, we cannot verify it but only give a @@ -718,7 +814,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) */ if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) { btrfs_warn_rl(fs_info, - "tree block at %llu crosses stripe boundary %llu", + "scrub: tree block at %llu crosses stripe boundary %llu", stripe->logical + (sector_nr << fs_info->sectorsize_bits), stripe->logical); @@ -733,17 +829,17 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) * cases without csum, we have no other choice but to trust it. */ if (!sector->csum) { - clear_bit(sector_nr, &stripe->error_bitmap); + scrub_bitmap_clear_bit_error(stripe, sector_nr); return; } - ret = btrfs_check_sector_csum(fs_info, page, pgoff, csum_buf, sector->csum); + ret = btrfs_check_sector_csum(fs_info, kaddr, csum_buf, sector->csum); if (ret < 0) { - set_bit(sector_nr, &stripe->csum_error_bitmap); - set_bit(sector_nr, &stripe->error_bitmap); + scrub_bitmap_set_bit_csum_error(stripe, sector_nr); + scrub_bitmap_set_bit_error(stripe, sector_nr); } else { - clear_bit(sector_nr, &stripe->csum_error_bitmap); - clear_bit(sector_nr, &stripe->error_bitmap); + scrub_bitmap_clear_bit_csum_error(stripe, sector_nr); + scrub_bitmap_clear_bit_error(stripe, sector_nr); } } @@ -756,7 +852,7 @@ static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long b for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) { scrub_verify_one_sector(stripe, sector_nr); - if (stripe->sectors[sector_nr].is_metadata) + if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) sector_nr += sectors_per_tree - 1; } } @@ -766,8 +862,7 @@ static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first int i; for (i = 0; i < stripe->nr_sectors; i++) { - if (scrub_stripe_get_page(stripe, i) == first_bvec->bv_page && - scrub_stripe_get_page_offset(stripe, i) == first_bvec->bv_offset) + if (scrub_stripe_get_kaddr(stripe, i) == bvec_virt(first_bvec)) break; } ASSERT(i < stripe->nr_sectors); @@ -795,13 +890,13 @@ static void scrub_repair_read_endio(struct btrfs_bio *bbio) bio_size += bvec->bv_len; if (bbio->bio.bi_status) { - bitmap_set(&stripe->io_error_bitmap, sector_nr, - bio_size >> fs_info->sectorsize_bits); - bitmap_set(&stripe->error_bitmap, sector_nr, - bio_size >> fs_info->sectorsize_bits); + scrub_bitmap_set_io_error(stripe, sector_nr, + bio_size >> fs_info->sectorsize_bits); + scrub_bitmap_set_error(stripe, sector_nr, + bio_size >> fs_info->sectorsize_bits); } else { - bitmap_clear(&stripe->io_error_bitmap, sector_nr, - bio_size >> fs_info->sectorsize_bits); + scrub_bitmap_clear_io_error(stripe, sector_nr, + bio_size >> fs_info->sectorsize_bits); } bio_put(&bbio->bio); if (atomic_dec_and_test(&stripe->pending_io)) @@ -814,27 +909,39 @@ static int calc_next_mirror(int mirror, int num_copies) return (mirror + 1 > num_copies) ? 1 : mirror + 1; } +static void scrub_bio_add_sector(struct btrfs_bio *bbio, struct scrub_stripe *stripe, + int sector_nr) +{ + void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); + int ret; + + ret = bio_add_page(&bbio->bio, virt_to_page(kaddr), bbio->fs_info->sectorsize, + offset_in_page(kaddr)); + /* + * Caller should ensure the bbio has enough size. + * And we cannot use __bio_add_page(), which doesn't do any merge. + * + * Meanwhile for scrub_submit_initial_read() we fully rely on the merge + * to create the minimal amount of bio vectors, for fs block size < page + * size cases. + */ + ASSERT(ret == bbio->fs_info->sectorsize); +} + static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, int mirror, int blocksize, bool wait) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; - const unsigned long old_error_bitmap = stripe->error_bitmap; + const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe); int i; ASSERT(stripe->mirror_num >= 1); ASSERT(atomic_read(&stripe->pending_io) == 0); for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) { - struct page *page; - int pgoff; - int ret; - - page = scrub_stripe_get_page(stripe, i); - pgoff = scrub_stripe_get_page_offset(stripe, i); - /* The current sector cannot be merged, submit the bio. */ - if (bbio && ((i > 0 && !test_bit(i - 1, &stripe->error_bitmap)) || + if (bbio && ((i > 0 && !test_bit(i - 1, &old_error_bitmap)) || bbio->bio.bi_iter.bi_size >= blocksize)) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); @@ -851,8 +958,7 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, (i << fs_info->sectorsize_bits)) >> SECTOR_SHIFT; } - ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); - ASSERT(ret == fs_info->sectorsize); + scrub_bio_add_sector(bbio, stripe, i); } if (bbio) { ASSERT(bbio->bio.bi_iter.bi_size); @@ -864,12 +970,15 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, } static void scrub_stripe_report_errors(struct scrub_ctx *sctx, - struct scrub_stripe *stripe) + struct scrub_stripe *stripe, + const struct scrub_error_records *errors) { static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_device *dev = NULL; + const unsigned long extent_bitmap = scrub_bitmap_read_has_extent(stripe); + const unsigned long error_bitmap = scrub_bitmap_read_error(stripe); u64 physical = 0; int nr_data_sectors = 0; int nr_meta_sectors = 0; @@ -886,7 +995,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio() * thus no need for dev/physical, error reporting still needs dev and physical. */ - if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) { + if (!bitmap_empty(&errors->init_error_bitmap, stripe->nr_sectors)) { u64 mapped_len = fs_info->sectorsize; struct btrfs_io_context *bioc = NULL; int stripe_index = stripe->mirror_num - 1; @@ -909,10 +1018,10 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, } skip: - for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) { + for_each_set_bit(sector_nr, &extent_bitmap, stripe->nr_sectors) { bool repaired = false; - if (stripe->sectors[sector_nr].is_metadata) { + if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) { nr_meta_sectors++; } else { nr_data_sectors++; @@ -920,14 +1029,14 @@ skip: nr_nodatacsum_sectors++; } - if (test_bit(sector_nr, &stripe->init_error_bitmap) && - !test_bit(sector_nr, &stripe->error_bitmap)) { + if (test_bit(sector_nr, &errors->init_error_bitmap) && + !test_bit(sector_nr, &error_bitmap)) { nr_repaired_sectors++; repaired = true; } /* Good sector from the beginning, nothing need to be done. */ - if (!test_bit(sector_nr, &stripe->init_error_bitmap)) + if (!test_bit(sector_nr, &errors->init_error_bitmap)) continue; /* @@ -936,13 +1045,13 @@ skip: */ if (repaired) { if (dev) { - btrfs_err_rl_in_rcu(fs_info, - "fixed up error at logical %llu on dev %s physical %llu", + btrfs_err_rl(fs_info, + "scrub: fixed up error at logical %llu on dev %s physical %llu", stripe->logical, btrfs_dev_name(dev), physical); } else { - btrfs_err_rl_in_rcu(fs_info, - "fixed up error at logical %llu on mirror %u", + btrfs_err_rl(fs_info, + "scrub: fixed up error at logical %llu on mirror %u", stripe->logical, stripe->mirror_num); } continue; @@ -950,41 +1059,56 @@ skip: /* The remaining are all for unrepaired. */ if (dev) { - btrfs_err_rl_in_rcu(fs_info, - "unable to fixup (regular) error at logical %llu on dev %s physical %llu", + btrfs_err_rl(fs_info, +"scrub: unable to fixup (regular) error at logical %llu on dev %s physical %llu", stripe->logical, btrfs_dev_name(dev), physical); } else { - btrfs_err_rl_in_rcu(fs_info, - "unable to fixup (regular) error at logical %llu on mirror %u", + btrfs_err_rl(fs_info, + "scrub: unable to fixup (regular) error at logical %llu on mirror %u", stripe->logical, stripe->mirror_num); } - if (test_bit(sector_nr, &stripe->io_error_bitmap)) + if (scrub_bitmap_test_bit_io_error(stripe, sector_nr)) if (__ratelimit(&rs) && dev) scrub_print_common_warning("i/o error", dev, false, stripe->logical, physical); - if (test_bit(sector_nr, &stripe->csum_error_bitmap)) + if (scrub_bitmap_test_bit_csum_error(stripe, sector_nr)) if (__ratelimit(&rs) && dev) scrub_print_common_warning("checksum error", dev, false, stripe->logical, physical); - if (test_bit(sector_nr, &stripe->meta_error_bitmap)) + if (scrub_bitmap_test_bit_meta_error(stripe, sector_nr)) if (__ratelimit(&rs) && dev) scrub_print_common_warning("header error", dev, false, stripe->logical, physical); + if (scrub_bitmap_test_bit_meta_gen_error(stripe, sector_nr)) + if (__ratelimit(&rs) && dev) + scrub_print_common_warning("generation error", dev, false, + stripe->logical, physical); } + /* Update the device stats. */ + for (int i = 0; i < errors->nr_io_errors; i++) + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_READ_ERRS); + for (int i = 0; i < errors->nr_csum_errors; i++) + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); + /* Generation mismatch error is based on each metadata, not each block. */ + for (int i = 0; i < errors->nr_meta_gen_errors; + i += (fs_info->nodesize >> fs_info->sectorsize_bits)) + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_GENERATION_ERRS); + spin_lock(&sctx->stat_lock); sctx->stat.data_extents_scrubbed += stripe->nr_data_extents; sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents; sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits; sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits; sctx->stat.no_csum += nr_nodatacsum_sectors; - sctx->stat.read_errors += stripe->init_nr_io_errors; - sctx->stat.csum_errors += stripe->init_nr_csum_errors; - sctx->stat.verify_errors += stripe->init_nr_meta_errors; + sctx->stat.read_errors += errors->nr_io_errors; + sctx->stat.csum_errors += errors->nr_csum_errors; + sctx->stat.verify_errors += errors->nr_meta_errors + + errors->nr_meta_gen_errors; sctx->stat.uncorrectable_errors += - bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors); + bitmap_weight(&error_bitmap, stripe->nr_sectors); sctx->stat.corrected_errors += nr_repaired_sectors; spin_unlock(&sctx->stat_lock); } @@ -1010,26 +1134,26 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work); struct scrub_ctx *sctx = stripe->sctx; struct btrfs_fs_info *fs_info = sctx->fs_info; + struct scrub_error_records errors = { 0 }; int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, stripe->bg->length); unsigned long repaired; + unsigned long error; int mirror; int i; ASSERT(stripe->mirror_num > 0); wait_scrub_stripe_io(stripe); - scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap); + scrub_verify_one_stripe(stripe, scrub_bitmap_read_has_extent(stripe)); /* Save the initial failed bitmap for later repair and report usage. */ - stripe->init_error_bitmap = stripe->error_bitmap; - stripe->init_nr_io_errors = bitmap_weight(&stripe->io_error_bitmap, - stripe->nr_sectors); - stripe->init_nr_csum_errors = bitmap_weight(&stripe->csum_error_bitmap, - stripe->nr_sectors); - stripe->init_nr_meta_errors = bitmap_weight(&stripe->meta_error_bitmap, - stripe->nr_sectors); - - if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) + errors.init_error_bitmap = scrub_bitmap_read_error(stripe); + errors.nr_io_errors = scrub_bitmap_weight_io_error(stripe); + errors.nr_csum_errors = scrub_bitmap_weight_csum_error(stripe); + errors.nr_meta_errors = scrub_bitmap_weight_meta_error(stripe); + errors.nr_meta_gen_errors = scrub_bitmap_weight_meta_gen_error(stripe); + + if (bitmap_empty(&errors.init_error_bitmap, stripe->nr_sectors)) goto out; /* @@ -1041,13 +1165,13 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) for (mirror = calc_next_mirror(stripe->mirror_num, num_copies); mirror != stripe->mirror_num; mirror = calc_next_mirror(mirror, num_copies)) { - const unsigned long old_error_bitmap = stripe->error_bitmap; + const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe); scrub_stripe_submit_repair_read(stripe, mirror, BTRFS_STRIPE_LEN, false); wait_scrub_stripe_io(stripe); scrub_verify_one_stripe(stripe, old_error_bitmap); - if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) + if (scrub_bitmap_empty_error(stripe)) goto out; } @@ -1065,21 +1189,22 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) for (i = 0, mirror = stripe->mirror_num; i < num_copies; i++, mirror = calc_next_mirror(mirror, num_copies)) { - const unsigned long old_error_bitmap = stripe->error_bitmap; + const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe); scrub_stripe_submit_repair_read(stripe, mirror, fs_info->sectorsize, true); wait_scrub_stripe_io(stripe); scrub_verify_one_stripe(stripe, old_error_bitmap); - if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) + if (scrub_bitmap_empty_error(stripe)) goto out; } out: + error = scrub_bitmap_read_error(stripe); /* * Submit the repaired sectors. For zoned case, we cannot do repair * in-place, but queue the bg to be relocated. */ - bitmap_andnot(&repaired, &stripe->init_error_bitmap, &stripe->error_bitmap, + bitmap_andnot(&repaired, &errors.init_error_bitmap, &error, stripe->nr_sectors); if (!sctx->readonly && !bitmap_empty(&repaired, stripe->nr_sectors)) { if (btrfs_is_zoned(fs_info)) { @@ -1090,7 +1215,7 @@ out: } } - scrub_stripe_report_errors(sctx, stripe); + scrub_stripe_report_errors(sctx, stripe, &errors); set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state); wake_up(&stripe->repair_wait); } @@ -1110,10 +1235,10 @@ static void scrub_read_endio(struct btrfs_bio *bbio) num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits; if (bbio->bio.bi_status) { - bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors); - bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors); + scrub_bitmap_set_io_error(stripe, sector_nr, num_sectors); + scrub_bitmap_set_error(stripe, sector_nr, num_sectors); } else { - bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors); + scrub_bitmap_clear_io_error(stripe, sector_nr, num_sectors); } bio_put(&bbio->bio); if (atomic_dec_and_test(&stripe->pending_io)) { @@ -1142,6 +1267,9 @@ static void scrub_write_endio(struct btrfs_bio *bbio) bitmap_set(&stripe->write_error_bitmap, sector_nr, bio_size >> fs_info->sectorsize_bits); spin_unlock_irqrestore(&stripe->write_error_lock, flags); + for (int i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++) + btrfs_dev_stat_inc_and_print(stripe->dev, + BTRFS_DEV_STAT_WRITE_ERRS); } bio_put(&bbio->bio); @@ -1199,12 +1327,8 @@ static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *str int sector_nr; for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) { - struct page *page = scrub_stripe_get_page(stripe, sector_nr); - unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr); - int ret; - /* We should only writeback sectors covered by an extent. */ - ASSERT(test_bit(sector_nr, &stripe->extent_sector_bitmap)); + ASSERT(scrub_bitmap_test_bit_has_extent(stripe, sector_nr)); /* Cannot merge with previous sector, submit the current one. */ if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) { @@ -1218,8 +1342,7 @@ static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *str (sector_nr << fs_info->sectorsize_bits)) >> SECTOR_SHIFT; } - ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); - ASSERT(ret == fs_info->sectorsize); + scrub_bio_add_sector(bbio, stripe, sector_nr); } if (bbio) scrub_submit_write_bio(sctx, stripe, bbio, dev_replace); @@ -1380,11 +1503,11 @@ static int find_first_extent_item(struct btrfs_root *extent_root, if (path->nodes[0]) goto search_forward; + key.objectid = search_start; if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; - key.objectid = search_start; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); @@ -1470,8 +1593,7 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, physical, sctx->write_pointer); if (ret) - btrfs_err(fs_info, - "zoned: failed to recover write pointer"); + btrfs_err(fs_info, "scrub: zoned: failed to recover write pointer"); } mutex_unlock(&sctx->wr_lock); btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical); @@ -1493,9 +1615,9 @@ static void fill_one_extent_info(struct btrfs_fs_info *fs_info, struct scrub_sector_verification *sector = &stripe->sectors[nr_sector]; - set_bit(nr_sector, &stripe->extent_sector_bitmap); + scrub_bitmap_set_bit_has_extent(stripe, nr_sector); if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - sector->is_metadata = true; + scrub_bitmap_set_bit_is_metadata(stripe, nr_sector); sector->generation = extent_gen; } } @@ -1503,15 +1625,8 @@ static void fill_one_extent_info(struct btrfs_fs_info *fs_info, static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe) { - stripe->extent_sector_bitmap = 0; - stripe->init_error_bitmap = 0; - stripe->init_nr_io_errors = 0; - stripe->init_nr_csum_errors = 0; - stripe->init_nr_meta_errors = 0; - stripe->error_bitmap = 0; - stripe->io_error_bitmap = 0; - stripe->csum_error_bitmap = 0; - stripe->meta_error_bitmap = 0; + ASSERT(stripe->nr_sectors); + bitmap_zero(stripe->bitmaps, scrub_bitmap_nr_last * stripe->nr_sectors); } /* @@ -1541,6 +1656,10 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, u64 extent_gen; int ret; + if (unlikely(!extent_root || !csum_root)) { + btrfs_err(fs_info, "scrub: no valid extent or csum root found"); + return -EUCLEAN; + } memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * stripe->nr_sectors); scrub_stripe_reset_bitmaps(stripe); @@ -1642,7 +1761,6 @@ static void scrub_reset_stripe(struct scrub_stripe *stripe) stripe->state = 0; for (int i = 0; i < stripe->nr_sectors; i++) { - stripe->sectors[i].is_metadata = false; stripe->sectors[i].csum = NULL; stripe->sectors[i].generation = 0; } @@ -1661,24 +1779,21 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits; + const unsigned long has_extent = scrub_bitmap_read_has_extent(stripe); u64 stripe_len = BTRFS_STRIPE_LEN; int mirror = stripe->mirror_num; int i; atomic_inc(&stripe->pending_io); - for_each_set_bit(i, &stripe->extent_sector_bitmap, stripe->nr_sectors) { - struct page *page = scrub_stripe_get_page(stripe, i); - unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i); - + for_each_set_bit(i, &has_extent, stripe->nr_sectors) { /* We're beyond the chunk boundary, no need to read anymore. */ if (i >= nr_sectors) break; /* The current sector cannot be merged, submit the bio. */ if (bbio && - ((i > 0 && - !test_bit(i - 1, &stripe->extent_sector_bitmap)) || + ((i > 0 && !test_bit(i - 1, &has_extent)) || bbio->bio.bi_iter.bi_size >= stripe_len)) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); @@ -1691,7 +1806,7 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) struct btrfs_io_context *bioc = NULL; const u64 logical = stripe->logical + (i << fs_info->sectorsize_bits); - int err; + int ret; io_stripe.rst_search_commit_root = true; stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits; @@ -1699,11 +1814,11 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) * For RST cases, we need to manually split the bbio to * follow the RST boundary. */ - err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, + ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, &stripe_len, &bioc, &io_stripe, &mirror); btrfs_put_bioc(bioc); - if (err < 0) { - if (err != -ENODATA) { + if (ret < 0) { + if (ret != -ENODATA) { /* * Earlier btrfs_get_raid_extent_offset() * returned -ENODATA, which means there's @@ -1712,8 +1827,8 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) * the extent tree, then it's a preallocated * extent and not an error. */ - set_bit(i, &stripe->io_error_bitmap); - set_bit(i, &stripe->error_bitmap); + scrub_bitmap_set_bit_io_error(stripe, i); + scrub_bitmap_set_bit_error(stripe, i); } continue; } @@ -1723,7 +1838,7 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT; } - __bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); + scrub_bio_add_sector(bbio, stripe, i); } if (bbio) { @@ -1761,15 +1876,8 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT; /* Read the whole range inside the chunk boundary. */ - for (unsigned int cur = 0; cur < nr_sectors; cur++) { - struct page *page = scrub_stripe_get_page(stripe, cur); - unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur); - int ret; - - ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); - /* We should have allocated enough bio vectors. */ - ASSERT(ret == fs_info->sectorsize); - } + for (unsigned int cur = 0; cur < nr_sectors; cur++) + scrub_bio_add_sector(bbio, stripe, cur); atomic_inc(&stripe->pending_io); /* @@ -1790,14 +1898,15 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, static bool stripe_has_metadata_error(struct scrub_stripe *stripe) { + const unsigned long error = scrub_bitmap_read_error(stripe); int i; - for_each_set_bit(i, &stripe->error_bitmap, stripe->nr_sectors) { - if (stripe->sectors[i].is_metadata) { + for_each_set_bit(i, &error, stripe->nr_sectors) { + if (scrub_bitmap_test_bit_is_metadata(stripe, i)) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; btrfs_err(fs_info, - "stripe %llu has unrepaired metadata sector at %llu", + "scrub: stripe %llu has unrepaired metadata sector at logical %llu", stripe->logical, stripe->logical + (i << fs_info->sectorsize_bits)); return true; @@ -1868,13 +1977,16 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) } for (int i = 0; i < nr_stripes; i++) { unsigned long good; + unsigned long has_extent; + unsigned long error; stripe = &sctx->stripes[i]; ASSERT(stripe->dev == fs_info->dev_replace.srcdev); - bitmap_andnot(&good, &stripe->extent_sector_bitmap, - &stripe->error_bitmap, stripe->nr_sectors); + has_extent = scrub_bitmap_read_has_extent(stripe); + error = scrub_bitmap_read_error(stripe); + bitmap_andnot(&good, &has_extent, &error, stripe->nr_sectors); scrub_write_sectors(sctx, stripe, good, true); } } @@ -2008,7 +2120,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, /* Check if all data stripes are empty. */ for (int i = 0; i < data_stripes; i++) { stripe = &sctx->raid56_data_stripes[i]; - if (!bitmap_empty(&stripe->extent_sector_bitmap, stripe->nr_sectors)) { + if (!scrub_bitmap_empty_has_extent(stripe)) { all_empty = false; break; } @@ -2040,25 +2152,28 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, */ for (int i = 0; i < data_stripes; i++) { unsigned long error; + unsigned long has_extent; stripe = &sctx->raid56_data_stripes[i]; + error = scrub_bitmap_read_error(stripe); + has_extent = scrub_bitmap_read_has_extent(stripe); + /* * We should only check the errors where there is an extent. * As we may hit an empty data stripe while it's missing. */ - bitmap_and(&error, &stripe->error_bitmap, - &stripe->extent_sector_bitmap, stripe->nr_sectors); + bitmap_and(&error, &error, &has_extent, stripe->nr_sectors); if (!bitmap_empty(&error, stripe->nr_sectors)) { btrfs_err(fs_info, -"unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", +"scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", full_stripe_start, i, stripe->nr_sectors, &error); ret = -EIO; goto out; } - bitmap_or(&extent_bitmap, &extent_bitmap, - &stripe->extent_sector_bitmap, stripe->nr_sectors); + bitmap_or(&extent_bitmap, &extent_bitmap, &has_extent, + stripe->nr_sectors); } /* Now we can check and regenerate the P/Q stripe. */ @@ -2493,8 +2608,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, path->skip_locking = 1; key.objectid = scrub_dev->devid; - key.offset = 0ull; key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = 0ull; while (1) { u64 dev_extent_len; @@ -2673,14 +2788,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, ro_set = 0; } else if (ret == -ETXTBSY) { btrfs_warn(fs_info, - "skipping scrub of block group %llu due to active swapfile", + "scrub: skipping scrub of block group %llu due to active swapfile", cache->start); scrub_pause_off(fs_info); ret = 0; goto skip_unfreeze; } else { - btrfs_warn(fs_info, - "failed setting block group ro: %d", ret); + btrfs_warn(fs_info, "scrub: failed setting block group ro: %d", + ret); btrfs_unfreeze_block_group(cache); btrfs_put_block_group(cache); scrub_pause_off(fs_info); @@ -2766,29 +2881,23 @@ static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev, struct page *page, u64 physical, u64 generation) { struct btrfs_fs_info *fs_info = sctx->fs_info; - struct bio_vec bvec; - struct bio bio; struct btrfs_super_block *sb = page_address(page); int ret; - bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ); - bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT; - __bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0); - ret = submit_bio_wait(&bio); - bio_uninit(&bio); - + ret = bdev_rw_virt(dev->bdev, physical >> SECTOR_SHIFT, sb, + BTRFS_SUPER_INFO_SIZE, REQ_OP_READ); if (ret < 0) return ret; ret = btrfs_check_super_csum(fs_info, sb); if (ret != 0) { btrfs_err_rl(fs_info, - "super block at physical %llu devid %llu has bad csum", + "scrub: super block at physical %llu devid %llu has bad csum", physical, dev->devid); return -EIO; } if (btrfs_super_generation(sb) != generation) { btrfs_err_rl(fs_info, -"super block at physical %llu devid %llu has bad generation %llu expect %llu", +"scrub: super block at physical %llu devid %llu has bad generation %llu expect %llu", physical, dev->devid, btrfs_super_generation(sb), generation); return -EUCLEAN; @@ -2948,8 +3057,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (!is_dev_replace && !readonly && !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); - btrfs_err_in_rcu(fs_info, - "scrub on devid %llu: filesystem on %s is not writable", + btrfs_err(fs_info, + "scrub: devid %llu: filesystem on %s is not writable", devid, btrfs_dev_name(dev)); ret = -EROFS; goto out; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7254279c3cc9..7664025a5af4 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4,6 +4,7 @@ */ #include <linux/bsearch.h> +#include <linux/falloc.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/sort.h> @@ -16,7 +17,6 @@ #include <linux/compat.h> #include <linux/crc32c.h> #include <linux/fsverity.h> - #include "send.h" #include "ctree.h" #include "backref.h" @@ -178,6 +178,7 @@ struct send_ctx { u64 cur_inode_rdev; u64 cur_inode_last_extent; u64 cur_inode_next_write_offset; + struct fs_path cur_inode_path; bool cur_inode_new; bool cur_inode_new_gen; bool cur_inode_deleted; @@ -383,11 +384,11 @@ static void inconsistent_snapshot_error(struct send_ctx *sctx, result_string = "updated"; break; case BTRFS_COMPARE_TREE_SAME: - ASSERT(0); + DEBUG_WARN("no change between trees"); result_string = "unchanged"; break; default: - ASSERT(0); + DEBUG_WARN("unexpected comparison result %d", result); result_string = "unexpected"; } @@ -425,15 +426,21 @@ static int need_send_hole(struct send_ctx *sctx) static void fs_path_reset(struct fs_path *p) { - if (p->reversed) { + if (p->reversed) p->start = p->buf + p->buf_len - 1; - p->end = p->start; - *p->start = 0; - } else { + else p->start = p->buf; - p->end = p->start; - *p->start = 0; - } + + p->end = p->start; + *p->start = 0; +} + +static void init_path(struct fs_path *p) +{ + p->reversed = 0; + p->buf = p->inline_buf; + p->buf_len = FS_PATH_INLINE_SIZE; + fs_path_reset(p); } static struct fs_path *fs_path_alloc(void) @@ -443,10 +450,7 @@ static struct fs_path *fs_path_alloc(void) p = kmalloc(sizeof(*p), GFP_KERNEL); if (!p) return NULL; - p->reversed = 0; - p->buf = p->inline_buf; - p->buf_len = FS_PATH_INLINE_SIZE; - fs_path_reset(p); + init_path(p); return p; } @@ -471,7 +475,7 @@ static void fs_path_free(struct fs_path *p) kfree(p); } -static int fs_path_len(struct fs_path *p) +static inline int fs_path_len(const struct fs_path *p) { return p->end - p->start; } @@ -487,12 +491,10 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) if (p->buf_len >= len) return 0; - if (len > PATH_MAX) { - WARN_ON(1); - return -ENOMEM; - } + if (WARN_ON(len > PATH_MAX)) + return -ENAMETOOLONG; - path_len = p->end - p->start; + path_len = fs_path_len(p); old_buf_len = p->buf_len; /* @@ -533,12 +535,12 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len, int ret; int new_len; - new_len = p->end - p->start + name_len; + new_len = fs_path_len(p) + name_len; if (p->start != p->end) new_len++; ret = fs_path_ensure_buf(p, new_len); if (ret < 0) - goto out; + return ret; if (p->reversed) { if (p->start != p->end) @@ -553,8 +555,7 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len, *p->end = 0; } -out: - return ret; + return 0; } static int fs_path_add(struct fs_path *p, const char *name, int name_len) @@ -564,25 +565,15 @@ static int fs_path_add(struct fs_path *p, const char *name, int name_len) ret = fs_path_prepare_for_add(p, name_len, &prepared); if (ret < 0) - goto out; + return ret; memcpy(prepared, name, name_len); -out: - return ret; + return 0; } -static int fs_path_add_path(struct fs_path *p, struct fs_path *p2) +static inline int fs_path_add_path(struct fs_path *p, const struct fs_path *p2) { - int ret; - char *prepared; - - ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared); - if (ret < 0) - goto out; - memcpy(prepared, p2->start, p2->end - p2->start); - -out: - return ret; + return fs_path_add(p, p2->start, fs_path_len(p2)); } static int fs_path_add_from_extent_buffer(struct fs_path *p, @@ -594,12 +585,11 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p, ret = fs_path_prepare_for_add(p, len, &prepared); if (ret < 0) - goto out; + return ret; read_extent_buffer(eb, prepared, off, len); -out: - return ret; + return 0; } static int fs_path_copy(struct fs_path *p, struct fs_path *from) @@ -619,13 +609,21 @@ static void fs_path_unreverse(struct fs_path *p) return; tmp = p->start; - len = p->end - p->start; + len = fs_path_len(p); p->start = p->buf; p->end = p->start + len; memmove(p->start, tmp, len + 1); p->reversed = 0; } +static inline bool is_current_inode_path(const struct send_ctx *sctx, + const struct fs_path *path) +{ + const struct fs_path *cur = &sctx->cur_inode_path; + + return (strncmp(path->start, cur->start, fs_path_len(cur)) == 0); +} + static struct btrfs_path *alloc_path_for_send(void) { struct btrfs_path *path; @@ -740,7 +738,7 @@ static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, #define TLV_PUT_PATH(sctx, attrtype, p) \ do { \ ret = tlv_put_string(sctx, attrtype, p->start, \ - p->end - p->start); \ + fs_path_len((p))); \ if (ret < 0) \ goto tlv_put_failure; \ } while(0) @@ -761,7 +759,7 @@ static int send_header(struct send_ctx *sctx) { struct btrfs_stream_header hdr; - strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); + strscpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); hdr.version = cpu_to_le32(sctx->proto); return write_buf(sctx->send_filp, &hdr, sizeof(hdr), &sctx->send_off); @@ -819,14 +817,11 @@ static int send_cmd(struct send_ctx *sctx) static int send_rename(struct send_ctx *sctx, struct fs_path *from, struct fs_path *to) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret; - btrfs_debug(fs_info, "send_rename %s -> %s", from->start, to->start); - ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from); TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to); @@ -834,7 +829,6 @@ static int send_rename(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -844,14 +838,11 @@ out: static int send_link(struct send_ctx *sctx, struct fs_path *path, struct fs_path *lnk) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret; - btrfs_debug(fs_info, "send_link %s -> %s", path->start, lnk->start); - ret = begin_cmd(sctx, BTRFS_SEND_C_LINK); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk); @@ -859,7 +850,6 @@ static int send_link(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -868,21 +858,17 @@ out: */ static int send_unlink(struct send_ctx *sctx, struct fs_path *path) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret; - btrfs_debug(fs_info, "send_unlink %s", path->start); - ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -891,21 +877,17 @@ out: */ static int send_rmdir(struct send_ctx *sctx, struct fs_path *path) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret; - btrfs_debug(fs_info, "send_rmdir %s", path->start); - ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -1580,7 +1562,6 @@ static int find_extent_clone(struct send_ctx *sctx, struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret; int extent_type; - u64 logical; u64 disk_byte; u64 num_bytes; struct btrfs_file_extent_item *fi; @@ -1611,7 +1592,6 @@ static int find_extent_clone(struct send_ctx *sctx, compressed = btrfs_file_extent_compression(eb, fi); num_bytes = btrfs_file_extent_num_bytes(eb, fi); - logical = disk_byte + btrfs_file_extent_offset(eb, fi); /* * Setup the clone roots. @@ -1693,14 +1673,8 @@ static int find_extent_clone(struct send_ctx *sctx, } up_read(&fs_info->commit_root_sem); - btrfs_debug(fs_info, - "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu", - data_offset, ino, num_bytes, logical); - - if (!backref_ctx.found) { - btrfs_debug(fs_info, "no clones found"); + if (!backref_ctx.found) return -ENOENT; - } cur_clone_root = NULL; for (i = 0; i < sctx->clone_roots_cnt; i++) { @@ -1831,7 +1805,7 @@ static int gen_unique_name(struct send_ctx *sctx, ino, gen, idx); ASSERT(len < sizeof(tmp)); tmp_name.name = tmp; - tmp_name.len = strlen(tmp); + tmp_name.len = len; di = btrfs_lookup_dir_item(NULL, sctx->send_root, path, BTRFS_FIRST_FREE_OBJECTID, @@ -1870,7 +1844,7 @@ static int gen_unique_name(struct send_ctx *sctx, break; } - ret = fs_path_add(dest, tmp, strlen(tmp)); + ret = fs_path_add(dest, tmp, len); out: btrfs_free_path(path); @@ -1897,7 +1871,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, ret = get_inode_info(sctx->send_root, ino, &info); if (ret < 0 && ret != -ENOENT) - goto out; + return ret; left_ret = (info.nlink == 0) ? -ENOENT : ret; left_gen = info.gen; if (send_gen) @@ -1908,7 +1882,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, } else { ret = get_inode_info(sctx->parent_root, ino, &info); if (ret < 0 && ret != -ENOENT) - goto out; + return ret; right_ret = (info.nlink == 0) ? -ENOENT : ret; right_gen = info.gen; if (parent_gen) @@ -1953,7 +1927,6 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, ret = -ENOENT; } -out: return ret; } @@ -1967,17 +1940,14 @@ static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen, ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen); if (ret < 0) - goto out; + return ret; if (ret == inode_state_no_change || ret == inode_state_did_create || ret == inode_state_will_delete) - ret = 1; - else - ret = 0; + return 1; -out: - return ret; + return 0; } /* @@ -2326,9 +2296,8 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, *parent_gen = nce->parent_gen; ret = fs_path_add(dest, nce->name, nce->name_len); if (ret < 0) - goto out; - ret = nce->ret; - goto out; + return ret; + return nce->ret; } } @@ -2339,12 +2308,12 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, */ ret = is_inode_existent(sctx, ino, gen, NULL, NULL); if (ret < 0) - goto out; + return ret; if (!ret) { ret = gen_unique_name(sctx, ino, gen, dest); if (ret < 0) - goto out; + return ret; ret = 1; goto out_cache; } @@ -2360,21 +2329,21 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, ret = get_first_ref(sctx->parent_root, ino, parent_ino, parent_gen, dest); if (ret < 0) - goto out; + return ret; /* * Check if the ref was overwritten by an inode's ref that was processed * earlier. If yes, treat as orphan and return 1. */ ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen, - dest->start, dest->end - dest->start); + dest->start, fs_path_len(dest)); if (ret < 0) - goto out; + return ret; if (ret) { fs_path_reset(dest); ret = gen_unique_name(sctx, ino, gen, dest); if (ret < 0) - goto out; + return ret; ret = 1; } @@ -2383,10 +2352,8 @@ out_cache: * Store the result of the lookup in the name cache. */ nce = kmalloc(sizeof(*nce) + fs_path_len(dest), GFP_KERNEL); - if (!nce) { - ret = -ENOMEM; - goto out; - } + if (!nce) + return -ENOMEM; nce->entry.key = ino; nce->entry.gen = gen; @@ -2404,10 +2371,9 @@ out_cache: nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL); if (nce_ret < 0) { kfree(nce); - ret = nce_ret; + return nce_ret; } -out: return ret; } @@ -2444,6 +2410,14 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, u64 parent_inode = 0; u64 parent_gen = 0; int stop = 0; + const bool is_cur_inode = (ino == sctx->cur_ino && gen == sctx->cur_inode_gen); + + if (is_cur_inode && fs_path_len(&sctx->cur_inode_path) > 0) { + if (dest != &sctx->cur_inode_path) + return fs_path_copy(dest, &sctx->cur_inode_path); + + return 0; + } name = fs_path_alloc(); if (!name) { @@ -2495,8 +2469,12 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, out: fs_path_free(name); - if (!ret) + if (!ret) { fs_path_unreverse(dest); + if (is_cur_inode && dest != &sctx->cur_inode_path) + ret = fs_path_copy(&sctx->cur_inode_path, dest); + } + return ret; } @@ -2591,25 +2569,60 @@ out: return ret; } +static struct fs_path *get_cur_inode_path(struct send_ctx *sctx) +{ + if (fs_path_len(&sctx->cur_inode_path) == 0) { + int ret; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, + &sctx->cur_inode_path); + if (ret < 0) + return ERR_PTR(ret); + } + + return &sctx->cur_inode_path; +} + +static struct fs_path *get_path_for_command(struct send_ctx *sctx, u64 ino, u64 gen) +{ + struct fs_path *path; + int ret; + + if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen) + return get_cur_inode_path(sctx); + + path = fs_path_alloc(); + if (!path) + return ERR_PTR(-ENOMEM); + + ret = get_cur_path(sctx, ino, gen, path); + if (ret < 0) { + fs_path_free(path); + return ERR_PTR(ret); + } + + return path; +} + +static void free_path_for_command(const struct send_ctx *sctx, struct fs_path *path) +{ + if (path != &sctx->cur_inode_path) + fs_path_free(path); +} + static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; - btrfs_debug(fs_info, "send_truncate %llu size=%llu", ino, size); - - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE); if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size); @@ -2617,29 +2630,23 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, p); return ret; } static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; - btrfs_debug(fs_info, "send_chmod %llu mode=%llu", ino, mode); - - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD); if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777); @@ -2647,32 +2654,26 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, p); return ret; } static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; if (sctx->proto < 2) return 0; - btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr); - - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR); if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr); @@ -2680,30 +2681,23 @@ static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, p); return ret; } static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; - btrfs_debug(fs_info, "send_chown %llu uid=%llu, gid=%llu", - ino, uid, gid); - - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN); if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid); TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid); @@ -2712,13 +2706,12 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, p); return ret; } static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p = NULL; struct btrfs_inode_item *ii; @@ -2727,11 +2720,9 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) struct btrfs_key key; int slot; - btrfs_debug(fs_info, "send_utimes %llu", ino); - - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); path = alloc_path_for_send(); if (!path) { @@ -2756,9 +2747,6 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime); @@ -2770,7 +2758,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, p); btrfs_free_path(path); return ret; } @@ -2838,7 +2826,6 @@ static int trim_dir_utimes_cache(struct send_ctx *sctx) */ static int send_create_inode(struct send_ctx *sctx, u64 ino) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; int cmd; @@ -2847,8 +2834,6 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino) u64 mode; u64 rdev; - btrfs_debug(fs_info, "send_create_inode %llu", ino); - p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3075,7 +3060,7 @@ static void __free_recorded_refs(struct list_head *head) struct recorded_ref *cur; while (!list_empty(head)) { - cur = list_entry(head->next, struct recorded_ref, list); + cur = list_first_entry(head, struct recorded_ref, list); recorded_ref_free(cur); } } @@ -3106,6 +3091,11 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen, goto out; ret = send_rename(sctx, path, orphan); + if (ret < 0) + goto out; + + if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen) + ret = fs_path_copy(&sctx->cur_inode_path, orphan); out: fs_path_free(orphan); @@ -4158,6 +4148,23 @@ out: return ret; } +static int rename_current_inode(struct send_ctx *sctx, + struct fs_path *current_path, + struct fs_path *new_path) +{ + int ret; + + ret = send_rename(sctx, current_path, new_path); + if (ret < 0) + return ret; + + ret = fs_path_copy(&sctx->cur_inode_path, new_path); + if (ret < 0) + return ret; + + return fs_path_copy(current_path, new_path); +} + /* * This does all the move/link/unlink/rmdir magic. */ @@ -4172,15 +4179,13 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) u64 ow_inode = 0; u64 ow_gen; u64 ow_mode; - int did_overwrite = 0; - int is_orphan = 0; u64 last_dir_ino_rm = 0; + bool did_overwrite = false; + bool is_orphan = false; bool can_rename = true; bool orphanized_dir = false; bool orphanized_ancestor = false; - btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino); - /* * This should never happen as the root dir always has the same ref * which is always '..' @@ -4216,14 +4221,14 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (ret < 0) goto out; if (ret) - did_overwrite = 1; + did_overwrite = true; } if (sctx->cur_inode_new || did_overwrite) { ret = gen_unique_name(sctx, sctx->cur_ino, sctx->cur_inode_gen, valid_path); if (ret < 0) goto out; - is_orphan = 1; + is_orphan = true; } else { ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, valid_path); @@ -4348,6 +4353,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (ret > 0) { orphanized_ancestor = true; fs_path_reset(valid_path); + fs_path_reset(&sctx->cur_inode_path); ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, valid_path); @@ -4443,13 +4449,10 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * it depending on the inode mode. */ if (is_orphan && can_rename) { - ret = send_rename(sctx, valid_path, cur->full_path); - if (ret < 0) - goto out; - is_orphan = 0; - ret = fs_path_copy(valid_path, cur->full_path); + ret = rename_current_inode(sctx, valid_path, cur->full_path); if (ret < 0) goto out; + is_orphan = false; } else if (can_rename) { if (S_ISDIR(sctx->cur_inode_mode)) { /* @@ -4457,10 +4460,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * dirs, we always have one new and one deleted * ref. The deleted ref is ignored later. */ - ret = send_rename(sctx, valid_path, - cur->full_path); - if (!ret) - ret = fs_path_copy(valid_path, + ret = rename_current_inode(sctx, valid_path, cur->full_path); if (ret < 0) goto out; @@ -4507,7 +4507,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) sctx->cur_inode_gen, valid_path); if (ret < 0) goto out; - is_orphan = 1; + is_orphan = true; } list_for_each_entry(cur, &sctx->deleted_refs, list) { @@ -4520,8 +4520,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) /* * We have a moved dir. Add the old parent to check_dirs */ - cur = list_entry(sctx->deleted_refs.next, struct recorded_ref, - list); + cur = list_first_entry(&sctx->deleted_refs, struct recorded_ref, list); ret = dup_ref(cur, &check_dirs); if (ret < 0) goto out; @@ -4553,6 +4552,8 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) ret = send_unlink(sctx, cur->full_path); if (ret < 0) goto out; + if (is_current_inode_path(sctx, cur->full_path)) + fs_path_reset(&sctx->cur_inode_path); } ret = dup_ref(cur, &check_dirs); if (ret < 0) @@ -4628,7 +4629,6 @@ static int rbtree_ref_comp(const void *k, const struct rb_node *node) { const struct recorded_ref *data = k; const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node); - int result; if (data->dir > ref->dir) return 1; @@ -4642,12 +4642,7 @@ static int rbtree_ref_comp(const void *k, const struct rb_node *node) return 1; if (data->name_len < ref->name_len) return -1; - result = strcmp(data->name, ref->name); - if (result > 0) - return 1; - if (result < 0) - return -1; - return 0; + return strcmp(data->name, ref->name); } static bool rbtree_ref_less(struct rb_node *node, const struct rb_node *parent) @@ -4701,7 +4696,7 @@ out: static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) { - int ret = 0; + int ret; struct send_ctx *sctx = ctx; struct rb_node *node = NULL; struct recorded_ref data; @@ -4710,7 +4705,7 @@ static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) ret = get_inode_gen(sctx->send_root, dir, &dir_gen); if (ret < 0) - goto out; + return ret; data.dir = dir; data.dir_gen = dir_gen; @@ -4724,13 +4719,13 @@ static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) &sctx->new_refs, name, dir, dir_gen, sctx); } -out: + return ret; } static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) { - int ret = 0; + int ret; struct send_ctx *sctx = ctx; struct rb_node *node = NULL; struct recorded_ref data; @@ -4739,7 +4734,7 @@ static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx ret = get_inode_gen(sctx->parent_root, dir, &dir_gen); if (ret < 0) - goto out; + return ret; data.dir = dir; data.dir_gen = dir_gen; @@ -4753,7 +4748,7 @@ static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx &sctx->deleted_refs, name, dir, dir_gen, sctx); } -out: + return ret; } @@ -4764,11 +4759,9 @@ static int record_new_ref(struct send_ctx *sctx) ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key, 0, record_new_ref_if_needed, sctx); if (ret < 0) - goto out; - ret = 0; + return ret; -out: - return ret; + return 0; } static int record_deleted_ref(struct send_ctx *sctx) @@ -4779,29 +4772,25 @@ static int record_deleted_ref(struct send_ctx *sctx) sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx); if (ret < 0) - goto out; - ret = 0; + return ret; -out: - return ret; + return 0; } static int record_changed_ref(struct send_ctx *sctx) { - int ret = 0; + int ret; ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key, 0, record_new_ref_if_needed, sctx); if (ret < 0) - goto out; + return ret; ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx); if (ret < 0) - goto out; - ret = 0; + return ret; -out: - return ret; + return 0; } /* @@ -4869,15 +4858,19 @@ out: } static int send_set_xattr(struct send_ctx *sctx, - struct fs_path *path, const char *name, int name_len, const char *data, int data_len) { - int ret = 0; + struct fs_path *path; + int ret; + + path = get_cur_inode_path(sctx); + if (IS_ERR(path)) + return PTR_ERR(path); ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len); @@ -4886,7 +4879,6 @@ static int send_set_xattr(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -4894,11 +4886,11 @@ static int send_remove_xattr(struct send_ctx *sctx, struct fs_path *path, const char *name, int name_len) { - int ret = 0; + int ret; ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len); @@ -4906,7 +4898,6 @@ static int send_remove_xattr(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -4914,19 +4905,13 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, const char *name, int name_len, const char *data, int data_len, void *ctx) { - int ret; struct send_ctx *sctx = ctx; - struct fs_path *p; struct posix_acl_xattr_header dummy_acl; /* Capabilities are emitted by finish_inode_if_needed */ if (!strncmp(name, XATTR_NAME_CAPS, name_len)) return 0; - p = fs_path_alloc(); - if (!p) - return -ENOMEM; - /* * This hack is needed because empty acls are stored as zero byte * data in xattrs. Problem with that is, that receiving these zero byte @@ -4943,48 +4928,27 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, } } - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto out; - - ret = send_set_xattr(sctx, p, name, name_len, data, data_len); - -out: - fs_path_free(p); - return ret; + return send_set_xattr(sctx, name, name_len, data, data_len); } static int __process_deleted_xattr(int num, struct btrfs_key *di_key, const char *name, int name_len, const char *data, int data_len, void *ctx) { - int ret; struct send_ctx *sctx = ctx; struct fs_path *p; - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) + return PTR_ERR(p); - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto out; - - ret = send_remove_xattr(sctx, p, name, name_len); - -out: - fs_path_free(p); - return ret; + return send_remove_xattr(sctx, p, name, name_len); } static int process_new_xattr(struct send_ctx *sctx) { - int ret = 0; - - ret = iterate_dir_item(sctx->send_root, sctx->left_path, - __process_new_xattr, sctx); - - return ret; + return iterate_dir_item(sctx->send_root, sctx->left_path, + __process_new_xattr, sctx); } static int process_deleted_xattr(struct send_ctx *sctx) @@ -5100,17 +5064,15 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key, static int process_changed_xattr(struct send_ctx *sctx) { - int ret = 0; + int ret; ret = iterate_dir_item(sctx->send_root, sctx->left_path, __process_changed_new_xattr, sctx); if (ret < 0) - goto out; - ret = iterate_dir_item(sctx->parent_root, sctx->right_path, - __process_changed_deleted_xattr, sctx); + return ret; -out: - return ret; + return iterate_dir_item(sctx->parent_root, sctx->right_path, + __process_changed_deleted_xattr, sctx); } static int process_all_new_xattrs(struct send_ctx *sctx) @@ -5157,7 +5119,7 @@ static int send_verity(struct send_ctx *sctx, struct fs_path *path, ret = begin_cmd(sctx, BTRFS_SEND_C_ENABLE_VERITY); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM, @@ -5172,21 +5134,20 @@ static int send_verity(struct send_ctx *sctx, struct fs_path *path, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } static int process_verity(struct send_ctx *sctx) { int ret = 0; - struct inode *inode; + struct btrfs_inode *inode; struct fs_path *p; inode = btrfs_iget(sctx->cur_ino, sctx->send_root); if (IS_ERR(inode)) return PTR_ERR(inode); - ret = btrfs_get_verity_descriptor(inode, NULL, 0); + ret = btrfs_get_verity_descriptor(&inode->vfs_inode, NULL, 0); if (ret < 0) goto iput; @@ -5203,27 +5164,19 @@ static int process_verity(struct send_ctx *sctx) } } - ret = btrfs_get_verity_descriptor(inode, sctx->verity_descriptor, ret); + ret = btrfs_get_verity_descriptor(&inode->vfs_inode, sctx->verity_descriptor, ret); if (ret < 0) goto iput; - p = fs_path_alloc(); - if (!p) { - ret = -ENOMEM; + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) { + ret = PTR_ERR(p); goto iput; } - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto free_path; ret = send_verity(sctx, p, sctx->verity_descriptor); - if (ret < 0) - goto free_path; - -free_path: - fs_path_free(p); iput: - iput(inode); + iput(&inode->vfs_inode); return ret; } @@ -5263,10 +5216,9 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; - struct folio *folio; - pgoff_t index = offset >> PAGE_SHIFT; - pgoff_t last_index; - unsigned pg_offset = offset_in_page(offset); + u64 cur = offset; + const u64 end = offset + len; + const pgoff_t last_index = ((end - 1) >> PAGE_SHIFT); struct address_space *mapping = sctx->cur_inode->i_mapping; int ret; @@ -5274,11 +5226,11 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) if (ret) return ret; - last_index = (offset + len - 1) >> PAGE_SHIFT; - - while (index <= last_index) { - unsigned cur_len = min_t(unsigned, len, - PAGE_SIZE - pg_offset); + while (cur < end) { + pgoff_t index = (cur >> PAGE_SHIFT); + unsigned int cur_len; + unsigned int pg_offset; + struct folio *folio; folio = filemap_lock_folio(mapping, index); if (IS_ERR(folio)) { @@ -5292,8 +5244,8 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) break; } } - - WARN_ON(folio_order(folio)); + pg_offset = offset_in_folio(folio, cur); + cur_len = min_t(unsigned int, end - cur, folio_size(folio) - pg_offset); if (folio_test_readahead(folio)) page_cache_async_readahead(mapping, &sctx->ra, NULL, folio, @@ -5312,15 +5264,18 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) ret = -EIO; break; } + if (folio->mapping != mapping) { + folio_unlock(folio); + folio_put(folio); + continue; + } } memcpy_from_folio(sctx->send_buf + sctx->send_size, folio, pg_offset, cur_len); folio_unlock(folio); folio_put(folio); - index++; - pg_offset = 0; - len -= cur_len; + cur += cur_len; sctx->send_size += cur_len; } @@ -5333,35 +5288,26 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) */ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; - p = fs_path_alloc(); - if (!p) - return -ENOMEM; - - btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len); + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) - goto out; - - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); ret = put_file_data(sctx, offset, len); if (ret < 0) - goto out; + return ret; ret = send_cmd(sctx); tlv_put_failure: -out: - fs_path_free(p); return ret; } @@ -5374,12 +5320,12 @@ static int send_clone(struct send_ctx *sctx, { int ret = 0; struct fs_path *p; + struct fs_path *cur_inode_path; u64 gen; - btrfs_debug(sctx->send_root->fs_info, - "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu", - offset, len, btrfs_root_id(clone_root->root), - clone_root->ino, clone_root->offset); + cur_inode_path = get_cur_inode_path(sctx); + if (IS_ERR(cur_inode_path)) + return PTR_ERR(cur_inode_path); p = fs_path_alloc(); if (!p) @@ -5389,13 +5335,9 @@ static int send_clone(struct send_ctx *sctx, if (ret < 0) goto out; - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto out; - TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len); - TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, cur_inode_path); if (clone_root->root == sctx->send_root) { ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen); @@ -5446,27 +5388,45 @@ static int send_update_extent(struct send_ctx *sctx, int ret = 0; struct fs_path *p; - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT); if (ret < 0) - goto out; + return ret; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len); - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + ret = send_cmd(sctx); + +tlv_put_failure: + return ret; +} + +static int send_fallocate(struct send_ctx *sctx, u32 mode, u64 offset, u64 len) +{ + struct fs_path *path; + int ret; + + path = get_cur_inode_path(sctx); + if (IS_ERR(path)) + return PTR_ERR(path); + + ret = begin_cmd(sctx, BTRFS_SEND_C_FALLOCATE); if (ret < 0) - goto out; + return ret; - TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + TLV_PUT_U32(sctx, BTRFS_SEND_A_FALLOCATE_MODE, mode); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len); ret = send_cmd(sctx); tlv_put_failure: -out: - fs_path_free(p); return ret; } @@ -5478,6 +5438,14 @@ static int send_hole(struct send_ctx *sctx, u64 end) int ret = 0; /* + * Starting with send stream v2 we have fallocate and can use it to + * punch holes instead of sending writes full of zeroes. + */ + if (proto_cmd_ok(sctx, BTRFS_SEND_C_FALLOCATE)) + return send_fallocate(sctx, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + offset, end - offset); + + /* * A hole that starts at EOF or beyond it. Since we do not yet support * fallocate (for extent preallocation and hole punching), sending a * write of zeroes starting at EOF or beyond would later require issuing @@ -5495,12 +5463,10 @@ static int send_hole(struct send_ctx *sctx, u64 end) if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) return send_update_extent(sctx, offset, end - offset); - p = fs_path_alloc(); - if (!p) - return -ENOMEM; - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto tlv_put_failure; + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) + return PTR_ERR(p); + while (offset < end) { u64 len = min(end - offset, read_size); @@ -5521,7 +5487,6 @@ static int send_hole(struct send_ctx *sctx, u64 end) } sctx->cur_inode_next_write_offset = offset; tlv_put_failure: - fs_path_free(p); return ret; } @@ -5529,9 +5494,7 @@ static int send_encoded_inline_extent(struct send_ctx *sctx, struct btrfs_path *path, u64 offset, u64 len) { - struct btrfs_root *root = sctx->send_root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct inode *inode; + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; struct fs_path *fspath; struct extent_buffer *leaf = path->nodes[0]; struct btrfs_key key; @@ -5540,23 +5503,13 @@ static int send_encoded_inline_extent(struct send_ctx *sctx, size_t inline_size; int ret; - inode = btrfs_iget(sctx->cur_ino, root); - if (IS_ERR(inode)) - return PTR_ERR(inode); - - fspath = fs_path_alloc(); - if (!fspath) { - ret = -ENOMEM; - goto out; - } + fspath = get_cur_inode_path(sctx); + if (IS_ERR(fspath)) + return PTR_ERR(fspath); ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE); if (ret < 0) - goto out; - - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); - if (ret < 0) - goto out; + return ret; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -5572,12 +5525,12 @@ static int send_encoded_inline_extent(struct send_ctx *sctx, ret = btrfs_encoded_io_compression_from_extent(fs_info, btrfs_file_extent_compression(leaf, ei)); if (ret < 0) - goto out; + return ret; TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret); ret = put_data_header(sctx, inline_size); if (ret < 0) - goto out; + return ret; read_extent_buffer(leaf, sctx->send_buf + sctx->send_size, btrfs_file_extent_inline_start(ei), inline_size); sctx->send_size += inline_size; @@ -5585,9 +5538,6 @@ static int send_encoded_inline_extent(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: - fs_path_free(fspath); - iput(inode); return ret; } @@ -5596,7 +5546,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; - struct inode *inode; + struct btrfs_inode *inode; struct fs_path *fspath; struct extent_buffer *leaf = path->nodes[0]; struct btrfs_key key; @@ -5611,9 +5561,9 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, if (IS_ERR(inode)) return PTR_ERR(inode); - fspath = fs_path_alloc(); - if (!fspath) { - ret = -ENOMEM; + fspath = get_cur_inode_path(sctx); + if (IS_ERR(fspath)) { + ret = PTR_ERR(fspath); goto out; } @@ -5621,10 +5571,6 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, if (ret < 0) goto out; - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); - if (ret < 0) - goto out; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); @@ -5666,7 +5612,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, * Note that send_buf is a mapping of send_buf_pages, so this is really * reading into send_buf. */ - ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), + ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, disk_num_bytes, sctx->send_buf_pages + (data_offset >> PAGE_SHIFT), @@ -5692,8 +5638,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, tlv_put_failure: out: - fs_path_free(fspath); - iput(inode); + iput(&inode->vfs_inode); return ret; } @@ -5735,15 +5680,14 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, } if (sctx->cur_inode == NULL) { + struct btrfs_inode *btrfs_inode; struct btrfs_root *root = sctx->send_root; - sctx->cur_inode = btrfs_iget(sctx->cur_ino, root); - if (IS_ERR(sctx->cur_inode)) { - int err = PTR_ERR(sctx->cur_inode); + btrfs_inode = btrfs_iget(sctx->cur_ino, root); + if (IS_ERR(btrfs_inode)) + return PTR_ERR(btrfs_inode); - sctx->cur_inode = NULL; - return err; - } + sctx->cur_inode = &btrfs_inode->vfs_inode; memset(&sctx->ra, 0, sizeof(struct file_ra_state)); file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping); @@ -5822,7 +5766,6 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, */ static int send_capabilities(struct send_ctx *sctx) { - struct fs_path *fspath = NULL; struct btrfs_path *path; struct btrfs_dir_item *di; struct extent_buffer *leaf; @@ -5848,25 +5791,19 @@ static int send_capabilities(struct send_ctx *sctx) leaf = path->nodes[0]; buf_len = btrfs_dir_data_len(leaf, di); - fspath = fs_path_alloc(); buf = kmalloc(buf_len, GFP_KERNEL); - if (!fspath || !buf) { + if (!buf) { ret = -ENOMEM; goto out; } - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); - if (ret < 0) - goto out; - data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di); read_extent_buffer(leaf, buf, data_ptr, buf_len); - ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS, + ret = send_set_xattr(sctx, XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), buf, buf_len); out: kfree(buf); - fs_path_free(fspath); btrfs_free_path(path); return ret; } @@ -6892,6 +6829,7 @@ static int changed_inode(struct send_ctx *sctx, sctx->cur_inode_last_extent = (u64)-1; sctx->cur_inode_next_write_offset = 0; sctx->ignore_cur_inode = false; + fs_path_reset(&sctx->cur_inode_path); /* * Set send_progress to current inode. This will tell all get_cur_xxx @@ -7253,7 +7191,7 @@ static int changed_cb(struct btrfs_path *left_path, enum btrfs_compare_tree_result result, struct send_ctx *sctx) { - int ret = 0; + int ret; /* * We can not hold the commit root semaphore here. This is because in @@ -7313,7 +7251,6 @@ static int changed_cb(struct btrfs_path *left_path, return 0; } result = BTRFS_COMPARE_TREE_CHANGED; - ret = 0; } sctx->left_path = left_path; @@ -8102,10 +8039,9 @@ static void dedupe_in_progress_warn(const struct btrfs_root *root) btrfs_root_id(root), root->dedupe_in_progress); } -long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg) +long btrfs_ioctl_send(struct btrfs_root *send_root, const struct btrfs_ioctl_send_args *arg) { int ret = 0; - struct btrfs_root *send_root = inode->root; struct btrfs_fs_info *fs_info = send_root->fs_info; struct btrfs_root *clone_root; struct send_ctx *sctx = NULL; @@ -8168,6 +8104,7 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a goto out; } + init_path(&sctx->cur_inode_path); INIT_LIST_HEAD(&sctx->new_refs); INIT_LIST_HEAD(&sctx->deleted_refs); @@ -8444,6 +8381,9 @@ out: btrfs_lru_cache_clear(&sctx->dir_created_cache); btrfs_lru_cache_clear(&sctx->dir_utimes_cache); + if (sctx->cur_inode_path.buf != sctx->cur_inode_path.inline_buf) + kfree(sctx->cur_inode_path.buf); + kfree(sctx); } diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 9309886c5ea1..652bb28f63d4 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -11,7 +11,7 @@ #include <linux/sizes.h> #include <linux/align.h> -struct btrfs_inode; +struct btrfs_root; struct btrfs_ioctl_send_args; #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" @@ -182,6 +182,6 @@ enum { __BTRFS_SEND_A_MAX = 35, }; -long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg); +long btrfs_ioctl_send(struct btrfs_root *send_root, const struct btrfs_ioctl_send_args *arg); #endif diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 255e85f78313..0481c693ac2e 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include "linux/spinlock.h" +#include <linux/spinlock.h> #include <linux/minmax.h> #include "misc.h" #include "ctree.h" @@ -14,6 +14,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "zoned.h" /* * HOW DOES SPACE RESERVATION WORK @@ -49,11 +50,11 @@ * num_bytes we want to reserve. * * ->reserve - * space_info->bytes_may_reserve += num_bytes + * space_info->bytes_may_use += num_bytes * * ->extent allocation * Call btrfs_add_reserved_bytes() which does - * space_info->bytes_may_reserve -= num_bytes + * space_info->bytes_may_use -= num_bytes * space_info->bytes_reserved += extent_bytes * * ->insert reference @@ -127,6 +128,14 @@ * churn a lot and we can avoid making some extent tree modifications if we * are able to delay for as long as possible. * + * RESET_ZONES + * This state works only for the zoned mode. On the zoned mode, we cannot + * reuse once allocated then freed region until we reset the zone, due to + * the sequential write zone requirement. The RESET_ZONES state resets the + * zones of an unused block group and let us reuse the space. The reusing + * is faster than removing the block group and allocating another block + * group on the zones. + * * ALLOC_CHUNK * We will skip this the first time through space reservation, because of * overcommit and we don't want to have a lot of useless metadata space when @@ -225,19 +234,11 @@ void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, WRITE_ONCE(space_info->chunk_size, chunk_size); } -static int create_space_info(struct btrfs_fs_info *info, u64 flags) +static void init_space_info(struct btrfs_fs_info *info, + struct btrfs_space_info *space_info, u64 flags) { - - struct btrfs_space_info *space_info; - int i; - int ret; - - space_info = kzalloc(sizeof(*space_info), GFP_NOFS); - if (!space_info) - return -ENOMEM; - space_info->fs_info = info; - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + for (int i = 0; i < BTRFS_NR_RAID_TYPES; i++) INIT_LIST_HEAD(&space_info->block_groups[i]); init_rwsem(&space_info->groups_sem); spin_lock_init(&space_info->lock); @@ -248,9 +249,64 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) INIT_LIST_HEAD(&space_info->priority_tickets); space_info->clamp = 1; btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags)); + space_info->subgroup_id = BTRFS_SUB_GROUP_PRIMARY; if (btrfs_is_zoned(info)) space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH; +} + +static int create_space_info_sub_group(struct btrfs_space_info *parent, u64 flags, + enum btrfs_space_info_sub_group id, int index) +{ + struct btrfs_fs_info *fs_info = parent->fs_info; + struct btrfs_space_info *sub_group; + int ret; + + ASSERT(parent->subgroup_id == BTRFS_SUB_GROUP_PRIMARY); + ASSERT(id != BTRFS_SUB_GROUP_PRIMARY); + + sub_group = kzalloc(sizeof(*sub_group), GFP_NOFS); + if (!sub_group) + return -ENOMEM; + + init_space_info(fs_info, sub_group, flags); + parent->sub_group[index] = sub_group; + sub_group->parent = parent; + sub_group->subgroup_id = id; + + ret = btrfs_sysfs_add_space_info_type(fs_info, sub_group); + if (ret) { + kfree(sub_group); + parent->sub_group[index] = NULL; + } + return ret; +} + +static int create_space_info(struct btrfs_fs_info *info, u64 flags) +{ + + struct btrfs_space_info *space_info; + int ret = 0; + + space_info = kzalloc(sizeof(*space_info), GFP_NOFS); + if (!space_info) + return -ENOMEM; + + init_space_info(info, space_info, flags); + + if (btrfs_is_zoned(info)) { + if (flags & BTRFS_BLOCK_GROUP_DATA) + ret = create_space_info_sub_group(space_info, flags, + BTRFS_SUB_GROUP_DATA_RELOC, + 0); + else if (flags & BTRFS_BLOCK_GROUP_METADATA) + ret = create_space_info_sub_group(space_info, flags, + BTRFS_SUB_GROUP_TREELOG, + 0); + + if (ret) + return ret; + } ret = btrfs_sysfs_add_space_info_type(info, space_info); if (ret) @@ -303,31 +359,29 @@ out: void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, struct btrfs_block_group *block_group) { - struct btrfs_space_info *found; + struct btrfs_space_info *space_info = block_group->space_info; int factor, index; factor = btrfs_bg_type_to_factor(block_group->flags); - found = btrfs_find_space_info(info, block_group->flags); - ASSERT(found); - spin_lock(&found->lock); - found->total_bytes += block_group->length; - found->disk_total += block_group->length * factor; - found->bytes_used += block_group->used; - found->disk_used += block_group->used * factor; - found->bytes_readonly += block_group->bytes_super; - btrfs_space_info_update_bytes_zone_unusable(info, found, block_group->zone_unusable); + spin_lock(&space_info->lock); + space_info->total_bytes += block_group->length; + space_info->disk_total += block_group->length * factor; + space_info->bytes_used += block_group->used; + space_info->disk_used += block_group->used * factor; + space_info->bytes_readonly += block_group->bytes_super; + btrfs_space_info_update_bytes_zone_unusable(space_info, block_group->zone_unusable); if (block_group->length > 0) - found->full = 0; - btrfs_try_granting_tickets(info, found); - spin_unlock(&found->lock); + space_info->full = 0; + btrfs_try_granting_tickets(info, space_info); + spin_unlock(&space_info->lock); - block_group->space_info = found; + block_group->space_info = space_info; index = btrfs_bg_flags_to_raid_index(block_group->flags); - down_write(&found->groups_sem); - list_add_tail(&block_group->list, &found->block_groups[index]); - up_write(&found->groups_sem); + down_write(&space_info->groups_sem); + list_add_tail(&block_group->list, &space_info->block_groups[index]); + up_write(&space_info->groups_sem); } struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, @@ -489,9 +543,7 @@ again: if ((used + ticket->bytes <= space_info->total_bytes) || btrfs_can_overcommit(fs_info, space_info, ticket->bytes, flush)) { - btrfs_space_info_update_bytes_may_use(fs_info, - space_info, - ticket->bytes); + btrfs_space_info_update_bytes_may_use(space_info, ticket->bytes); remove_ticket(space_info, ticket); ticket->bytes = 0; space_info->tickets_id++; @@ -549,8 +601,9 @@ static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info, lockdep_assert_held(&info->lock); /* The free space could be negative in case of overcommit */ - btrfs_info(fs_info, "space_info %s has %lld free, is %sfull", - flag_str, + btrfs_info(fs_info, + "space_info %s (sub-group id %d) has %lld free, is %sfull", + flag_str, info->subgroup_id, (s64)(info->total_bytes - btrfs_space_info_used(info, true)), info->full ? "" : "not "); btrfs_info(fs_info, @@ -562,7 +615,7 @@ static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info, void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info, u64 bytes, - int dump_block_groups) + bool dump_block_groups) { struct btrfs_block_group *cache; u64 total_avail = 0; @@ -805,7 +858,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, ret = PTR_ERR(trans); break; } - ret = btrfs_chunk_alloc(trans, + ret = btrfs_chunk_alloc(trans, space_info, btrfs_get_alloc_profile(fs_info, space_info->flags), (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); @@ -834,6 +887,9 @@ static void flush_space(struct btrfs_fs_info *fs_info, */ ret = btrfs_commit_current_transaction(root); break; + case RESET_ZONES: + ret = btrfs_reset_unused_block_groups(space_info, num_bytes); + break; default: ret = -ENOSPC; break; @@ -1073,22 +1129,19 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, return (tickets_id != space_info->tickets_id); } -/* - * This is for normal flushers, we can wait all goddamned day if we want to. We - * will loop and continuously try to flush as long as we are making progress. - * We count progress as clearing off tickets each time we have to loop. - */ -static void btrfs_async_reclaim_metadata_space(struct work_struct *work) +static void do_async_reclaim_metadata_space(struct btrfs_space_info *space_info) { - struct btrfs_fs_info *fs_info; - struct btrfs_space_info *space_info; + struct btrfs_fs_info *fs_info = space_info->fs_info; u64 to_reclaim; enum btrfs_flush_state flush_state; int commit_cycles = 0; u64 last_tickets_id; + enum btrfs_flush_state final_state; - fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); - space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + if (btrfs_is_zoned(fs_info)) + final_state = RESET_ZONES; + else + final_state = COMMIT_TRANS; spin_lock(&space_info->lock); to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); @@ -1141,7 +1194,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) flush_state++; - if (flush_state > COMMIT_TRANS) { + if (flush_state > final_state) { commit_cycles++; if (commit_cycles > 2) { if (maybe_fail_all_tickets(fs_info, space_info)) { @@ -1155,7 +1208,26 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) } } spin_unlock(&space_info->lock); - } while (flush_state <= COMMIT_TRANS); + } while (flush_state <= final_state); +} + +/* + * This is for normal flushers, it can wait as much time as needed. We will + * loop and continuously try to flush as long as we are making progress. We + * count progress as clearing off tickets each time we have to loop. + */ +static void btrfs_async_reclaim_metadata_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + + fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + do_async_reclaim_metadata_space(space_info); + for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) { + if (space_info->sub_group[i]) + do_async_reclaim_metadata_space(space_info->sub_group[i]); + } } /* @@ -1286,6 +1358,10 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) * This is where we reclaim all of the pinned space generated by running the * iputs * + * RESET_ZONES + * This state works only for the zoned mode. We scan the unused block group + * list and reset the zones and reuse the block group. + * * ALLOC_CHUNK_FORCE * For data we start with alloc chunk force, however we could have been full * before, and then the transaction commit could have freed new block groups, @@ -1295,19 +1371,16 @@ static const enum btrfs_flush_state data_flush_states[] = { FLUSH_DELALLOC_FULL, RUN_DELAYED_IPUTS, COMMIT_TRANS, + RESET_ZONES, ALLOC_CHUNK_FORCE, }; -static void btrfs_async_reclaim_data_space(struct work_struct *work) +static void do_async_reclaim_data_space(struct btrfs_space_info *space_info) { - struct btrfs_fs_info *fs_info; - struct btrfs_space_info *space_info; + struct btrfs_fs_info *fs_info = space_info->fs_info; u64 last_tickets_id; enum btrfs_flush_state flush_state = 0; - fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work); - space_info = fs_info->data_sinfo; - spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { space_info->flush = 0; @@ -1375,6 +1448,19 @@ aborted_fs: spin_unlock(&space_info->lock); } +static void btrfs_async_reclaim_data_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + + fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work); + space_info = fs_info->data_sinfo; + do_async_reclaim_data_space(space_info); + for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) + if (space_info->sub_group[i]) + do_async_reclaim_data_space(space_info->sub_group[i]); +} + void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) { INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space); @@ -1386,6 +1472,7 @@ void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) static const enum btrfs_flush_state priority_flush_states[] = { FLUSH_DELAYED_ITEMS_NR, FLUSH_DELAYED_ITEMS, + RESET_ZONES, ALLOC_CHUNK, }; @@ -1399,6 +1486,7 @@ static const enum btrfs_flush_state evict_flush_states[] = { FLUSH_DELALLOC_FULL, ALLOC_CHUNK, COMMIT_TRANS, + RESET_ZONES, }; static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, @@ -1690,8 +1778,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, if (!pending_tickets && ((used + orig_bytes <= space_info->total_bytes) || btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { - btrfs_space_info_update_bytes_may_use(fs_info, space_info, - orig_bytes); + btrfs_space_info_update_bytes_may_use(space_info, orig_bytes); ret = 0; } @@ -1703,8 +1790,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) { used = btrfs_space_info_used(space_info, false); if (used + orig_bytes <= space_info->total_bytes) { - btrfs_space_info_update_bytes_may_use(fs_info, space_info, - orig_bytes); + btrfs_space_info_update_bytes_may_use(space_info, orig_bytes); ret = 0; } } @@ -1801,7 +1887,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, space_info->flags, orig_bytes, 1); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_dump_space_info(fs_info, space_info, orig_bytes, 0); + btrfs_dump_space_info(fs_info, space_info, orig_bytes, false); } return ret; } @@ -1816,10 +1902,10 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, * This will reserve bytes from the data space info. If there is not enough * space then we will attempt to flush space as specified by flush. */ -int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, +int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { - struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; + struct btrfs_fs_info *fs_info = space_info->fs_info; int ret; ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA || @@ -1827,12 +1913,12 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, flush == BTRFS_RESERVE_NO_FLUSH); ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); - ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush); + ret = __reserve_bytes(fs_info, space_info, bytes, flush); if (ret == -ENOSPC) { trace_btrfs_space_reservation(fs_info, "space_info:enospc", - data_sinfo->flags, bytes, 1); + space_info->flags, bytes, 1); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0); + btrfs_dump_space_info(fs_info, space_info, bytes, false); } return ret; } @@ -1887,13 +1973,13 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) static u64 calc_pct_ratio(u64 x, u64 y) { - int err; + int ret; if (!y) return 0; again: - err = check_mul_overflow(100, x, &x); - if (err) + ret = check_mul_overflow(100, x, &x); + if (ret) goto lose_precision; return div64_u64(x, y); lose_precision: @@ -2053,7 +2139,7 @@ void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool } } -bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info) +static bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info) { bool ret; @@ -2082,3 +2168,32 @@ void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info) do_reclaim_sweep(space_info, raid); } } + +void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + + lockdep_assert_held(&space_info->lock); + + /* Prioritize the global reservation to receive the freed space. */ + if (global_rsv->space_info != space_info) + goto grant; + + spin_lock(&global_rsv->lock); + if (!global_rsv->full) { + u64 to_add = min(len, global_rsv->size - global_rsv->reserved); + + global_rsv->reserved += to_add; + btrfs_space_info_update_bytes_may_use(space_info, to_add); + if (global_rsv->reserved >= global_rsv->size) + global_rsv->full = 1; + len -= to_add; + } + spin_unlock(&global_rsv->lock); + +grant: + /* Add to any tickets we may have. */ + if (len) + btrfs_try_granting_tickets(fs_info, space_info); +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index efbecc0c5258..679f22efb407 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -79,6 +79,10 @@ enum btrfs_reserve_flush_enum { BTRFS_RESERVE_FLUSH_EMERGENCY, }; +/* + * Please be aware that the order of enum values will be the order of the reclaim + * process in btrfs_async_reclaim_metadata_space(). + */ enum btrfs_flush_state { FLUSH_DELAYED_ITEMS_NR = 1, FLUSH_DELAYED_ITEMS = 2, @@ -91,10 +95,21 @@ enum btrfs_flush_state { ALLOC_CHUNK_FORCE = 9, RUN_DELAYED_IPUTS = 10, COMMIT_TRANS = 11, + RESET_ZONES = 12, +}; + +enum btrfs_space_info_sub_group { + BTRFS_SUB_GROUP_PRIMARY, + BTRFS_SUB_GROUP_DATA_RELOC, + BTRFS_SUB_GROUP_TREELOG, }; +#define BTRFS_SPACE_INFO_SUB_GROUP_MAX 1 struct btrfs_space_info { struct btrfs_fs_info *fs_info; + struct btrfs_space_info *parent; + struct btrfs_space_info *sub_group[BTRFS_SPACE_INFO_SUB_GROUP_MAX]; + int subgroup_id; spinlock_t lock; u64 total_bytes; /* total bytes in the space, @@ -229,10 +244,10 @@ static inline bool btrfs_mixed_space_info(const struct btrfs_space_info *space_i */ #define DECLARE_SPACE_INFO_UPDATE(name, trace_name) \ static inline void \ -btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \ - struct btrfs_space_info *sinfo, \ +btrfs_space_info_update_##name(struct btrfs_space_info *sinfo, \ s64 bytes) \ { \ + struct btrfs_fs_info *fs_info = sinfo->fs_info; \ const u64 abs_bytes = (bytes < 0) ? -bytes : bytes; \ lockdep_assert_held(&sinfo->lock); \ trace_update_##name(fs_info, sinfo, sinfo->name, bytes); \ @@ -263,7 +278,7 @@ u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info, void btrfs_clear_space_info_full(struct btrfs_fs_info *info); void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info, u64 bytes, - int dump_block_groups); + bool dump_block_groups); int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 orig_bytes, @@ -275,16 +290,15 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush); static inline void btrfs_space_info_free_bytes_may_use( - struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes) { spin_lock(&space_info->lock); - btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes); - btrfs_try_granting_tickets(fs_info, space_info); + btrfs_space_info_update_bytes_may_use(space_info, -num_bytes); + btrfs_try_granting_tickets(space_info->fs_info, space_info); spin_unlock(&space_info->lock); } -int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, +int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush); void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info); void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); @@ -292,8 +306,8 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes); void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready); -bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info); int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info); void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info); +void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len); #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 8c68059ac1b0..c9b3821957f7 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -2,12 +2,11 @@ #include <linux/slab.h> #include "messages.h" -#include "ctree.h" #include "subpage.h" #include "btrfs_inode.h" /* - * Subpage (sectorsize < PAGE_SIZE) support overview: + * Subpage (block size < folio size) support overview: * * Limitations: * @@ -50,7 +49,7 @@ * Implementation: * * - Common - * Both metadata and data will use a new structure, btrfs_subpage, to + * Both metadata and data will use a new structure, btrfs_folio_state, to * record the status of each sector inside a page. This provides the extra * granularity needed. * @@ -64,34 +63,14 @@ * This means a slightly higher tree locking latency. */ -#if PAGE_SIZE > SZ_4K -bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping) +int btrfs_attach_folio_state(const struct btrfs_fs_info *fs_info, + struct folio *folio, enum btrfs_folio_type type) { - if (fs_info->sectorsize >= PAGE_SIZE) - return false; + struct btrfs_folio_state *bfs; - /* - * Only data pages (either through DIO or compression) can have no - * mapping. And if page->mapping->host is data inode, it's subpage. - * As we have ruled our sectorsize >= PAGE_SIZE case already. - */ - if (!mapping || !mapping->host || is_data_inode(BTRFS_I(mapping->host))) - return true; - - /* - * Now the only remaining case is metadata, which we only go subpage - * routine if nodesize < PAGE_SIZE. - */ - if (fs_info->nodesize < PAGE_SIZE) - return true; - return false; -} -#endif - -int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, - struct folio *folio, enum btrfs_subpage_type type) -{ - struct btrfs_subpage *subpage; + /* For metadata we don't support large folio yet. */ + if (type == BTRFS_SUBPAGE_METADATA) + ASSERT(!folio_test_large(folio)); /* * We have cases like a dummy extent buffer page, which is not mapped @@ -101,40 +80,50 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, ASSERT(folio_test_locked(folio)); /* Either not subpage, or the folio already has private attached. */ - if (!btrfs_is_subpage(fs_info, folio->mapping) || folio_test_private(folio)) + if (folio_test_private(folio)) + return 0; + if (type == BTRFS_SUBPAGE_METADATA && !btrfs_meta_is_subpage(fs_info)) + return 0; + if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio)) return 0; - subpage = btrfs_alloc_subpage(fs_info, type); - if (IS_ERR(subpage)) - return PTR_ERR(subpage); + bfs = btrfs_alloc_folio_state(fs_info, folio_size(folio), type); + if (IS_ERR(bfs)) + return PTR_ERR(bfs); - folio_attach_private(folio, subpage); + folio_attach_private(folio, bfs); return 0; } -void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio) +void btrfs_detach_folio_state(const struct btrfs_fs_info *fs_info, struct folio *folio, + enum btrfs_folio_type type) { - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; /* Either not subpage, or the folio already has private attached. */ - if (!btrfs_is_subpage(fs_info, folio->mapping) || !folio_test_private(folio)) + if (!folio_test_private(folio)) + return; + if (type == BTRFS_SUBPAGE_METADATA && !btrfs_meta_is_subpage(fs_info)) + return; + if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio)) return; - subpage = folio_detach_private(folio); - ASSERT(subpage); - btrfs_free_subpage(subpage); + bfs = folio_detach_private(folio); + ASSERT(bfs); + btrfs_free_folio_state(bfs); } -struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, - enum btrfs_subpage_type type) +struct btrfs_folio_state *btrfs_alloc_folio_state(const struct btrfs_fs_info *fs_info, + size_t fsize, enum btrfs_folio_type type) { - struct btrfs_subpage *ret; + struct btrfs_folio_state *ret; unsigned int real_size; - ASSERT(fs_info->sectorsize < PAGE_SIZE); + ASSERT(fs_info->sectorsize < fsize); real_size = struct_size(ret, bitmaps, - BITS_TO_LONGS(btrfs_bitmap_nr_max * fs_info->sectors_per_page)); + BITS_TO_LONGS(btrfs_bitmap_nr_max * + (fsize >> fs_info->sectorsize_bits))); ret = kzalloc(real_size, GFP_NOFS); if (!ret) return ERR_PTR(-ENOMEM); @@ -147,11 +136,6 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, return ret; } -void btrfs_free_subpage(struct btrfs_subpage *subpage) -{ - kfree(subpage); -} - /* * Increase the eb_refs of current subpage. * @@ -163,39 +147,36 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage) */ void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio) { - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_meta_is_subpage(fs_info)) return; ASSERT(folio_test_private(folio) && folio->mapping); lockdep_assert_held(&folio->mapping->i_private_lock); - subpage = folio_get_private(folio); - atomic_inc(&subpage->eb_refs); + bfs = folio_get_private(folio); + atomic_inc(&bfs->eb_refs); } void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio) { - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_meta_is_subpage(fs_info)) return; ASSERT(folio_test_private(folio) && folio->mapping); lockdep_assert_held(&folio->mapping->i_private_lock); - subpage = folio_get_private(folio); - ASSERT(atomic_read(&subpage->eb_refs)); - atomic_dec(&subpage->eb_refs); + bfs = folio_get_private(folio); + ASSERT(atomic_read(&bfs->eb_refs)); + atomic_dec(&bfs->eb_refs); } static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - /* For subpage support, the folio must be single page. */ - ASSERT(folio_order(folio) == 0); - /* Basic checks */ ASSERT(folio_test_private(folio) && folio_get_private(folio)); ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && @@ -205,17 +186,20 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, * unmapped page like dummy extent buffer pages. */ if (folio->mapping) - ASSERT(folio_pos(folio) <= start && - start + len <= folio_pos(folio) + PAGE_SIZE); + ASSERT(folio_pos(folio) <= start && start + len <= folio_end(folio), + "start=%llu len=%u folio_pos=%llu folio_size=%zu", + start, len, folio_pos(folio), folio_size(folio)); } #define subpage_calc_start_bit(fs_info, folio, name, start, len) \ ({ \ - unsigned int __start_bit; \ + unsigned int __start_bit; \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ \ btrfs_subpage_assert(fs_info, folio, start, len); \ - __start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ - __start_bit += fs_info->sectors_per_page * btrfs_bitmap_nr_##name; \ + __start_bit = offset_in_folio(folio, start) >> fs_info->sectorsize_bits; \ + __start_bit += blocks_per_folio * btrfs_bitmap_nr_##name; \ __start_bit; \ }) @@ -233,14 +217,13 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) if (folio_pos(folio) >= orig_start + orig_len) *len = 0; else - *len = min_t(u64, folio_pos(folio) + PAGE_SIZE, - orig_start + orig_len) - *start; + *len = min_t(u64, folio_end(folio), orig_start + orig_len) - *start; } static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); const int nbits = (len >> fs_info->sectorsize_bits); unsigned long flags; @@ -250,7 +233,7 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, btrfs_subpage_assert(fs_info, folio, start, len); - spin_lock_irqsave(&subpage->lock, flags); + spin_lock_irqsave(&bfs->lock, flags); /* * We have call sites passing @lock_page into * extent_clear_unlock_delalloc() for compression path. @@ -258,18 +241,18 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, * This @locked_page is locked by plain lock_page(), thus its * subpage::locked is 0. Handle them in a special way. */ - if (atomic_read(&subpage->nr_locked) == 0) { - spin_unlock_irqrestore(&subpage->lock, flags); + if (atomic_read(&bfs->nr_locked) == 0) { + spin_unlock_irqrestore(&bfs->lock, flags); return true; } - for_each_set_bit_from(bit, subpage->bitmaps, start_bit + nbits) { - clear_bit(bit, subpage->bitmaps); + for_each_set_bit_from(bit, bfs->bitmaps, start_bit + nbits) { + clear_bit(bit, bfs->bitmaps); cleared++; } - ASSERT(atomic_read(&subpage->nr_locked) >= cleared); - last = atomic_sub_and_test(cleared, &subpage->nr_locked); - spin_unlock_irqrestore(&subpage->lock, flags); + ASSERT(atomic_read(&bfs->nr_locked) >= cleared); + last = atomic_sub_and_test(cleared, &bfs->nr_locked); + spin_unlock_irqrestore(&bfs->lock, flags); return last; } @@ -292,11 +275,11 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); ASSERT(folio_test_locked(folio)); - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio)) { folio_unlock(folio); return; } @@ -308,7 +291,7 @@ void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, * Since we own the page lock, no one else could touch subpage::locked * and we are safe to do several atomic operations without spinlock. */ - if (atomic_read(&subpage->nr_locked) == 0) { + if (atomic_read(&bfs->nr_locked) == 0) { /* No subpage lock, locked by plain lock_page(). */ folio_unlock(folio); return; @@ -322,86 +305,99 @@ void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, unsigned long bitmap) { - struct btrfs_subpage *subpage = folio_get_private(folio); - const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked; + struct btrfs_folio_state *bfs = folio_get_private(folio); + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); + const int start_bit = blocks_per_folio * btrfs_bitmap_nr_locked; unsigned long flags; bool last = false; int cleared = 0; int bit; - if (!btrfs_is_subpage(fs_info, folio->mapping)) { + if (!btrfs_is_subpage(fs_info, folio)) { folio_unlock(folio); return; } - if (atomic_read(&subpage->nr_locked) == 0) { + if (atomic_read(&bfs->nr_locked) == 0) { /* No subpage lock, locked by plain lock_page(). */ folio_unlock(folio); return; } - spin_lock_irqsave(&subpage->lock, flags); - for_each_set_bit(bit, &bitmap, fs_info->sectors_per_page) { - if (test_and_clear_bit(bit + start_bit, subpage->bitmaps)) + spin_lock_irqsave(&bfs->lock, flags); + for_each_set_bit(bit, &bitmap, blocks_per_folio) { + if (test_and_clear_bit(bit + start_bit, bfs->bitmaps)) cleared++; } - ASSERT(atomic_read(&subpage->nr_locked) >= cleared); - last = atomic_sub_and_test(cleared, &subpage->nr_locked); - spin_unlock_irqrestore(&subpage->lock, flags); + ASSERT(atomic_read(&bfs->nr_locked) >= cleared); + last = atomic_sub_and_test(cleared, &bfs->nr_locked); + spin_unlock_irqrestore(&bfs->lock, flags); if (last) folio_unlock(folio); } -#define subpage_test_bitmap_all_set(fs_info, subpage, name) \ - bitmap_test_range_all_set(subpage->bitmaps, \ - fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \ - fs_info->sectors_per_page) +#define subpage_test_bitmap_all_set(fs_info, folio, name) \ +({ \ + struct btrfs_folio_state *bfs = folio_get_private(folio); \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ + \ + bitmap_test_range_all_set(bfs->bitmaps, \ + blocks_per_folio * btrfs_bitmap_nr_##name, \ + blocks_per_folio); \ +}) -#define subpage_test_bitmap_all_zero(fs_info, subpage, name) \ - bitmap_test_range_all_zero(subpage->bitmaps, \ - fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \ - fs_info->sectors_per_page) +#define subpage_test_bitmap_all_zero(fs_info, folio, name) \ +({ \ + struct btrfs_folio_state *bfs = folio_get_private(folio); \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ + \ + bitmap_test_range_all_zero(bfs->bitmaps, \ + blocks_per_folio * btrfs_bitmap_nr_##name, \ + blocks_per_folio); \ +}) void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, uptodate, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate)) + spin_lock_irqsave(&bfs->lock, flags); + bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_set(fs_info, folio, uptodate)) folio_mark_uptodate(folio); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, uptodate, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + spin_lock_irqsave(&bfs->lock, flags); + bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); folio_clear_uptodate(folio); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_lock_irqsave(&bfs->lock, flags); + bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + spin_unlock_irqrestore(&bfs->lock, flags); folio_mark_dirty(folio); } @@ -418,17 +414,17 @@ void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len); unsigned long flags; bool last = false; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_zero(fs_info, subpage, dirty)) + spin_lock_irqsave(&bfs->lock, flags); + bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_zero(fs_info, folio, dirty)) last = true; - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); return last; } @@ -445,91 +441,91 @@ void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info, void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, writeback, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + spin_lock_irqsave(&bfs->lock, flags); + bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (!folio_test_writeback(folio)) folio_start_writeback(folio); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, writeback, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) { + spin_lock_irqsave(&bfs->lock, flags); + bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_zero(fs_info, folio, writeback)) { ASSERT(folio_test_writeback(folio)); folio_end_writeback(folio); } - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, ordered, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + spin_lock_irqsave(&bfs->lock, flags); + bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); folio_set_ordered(folio); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, ordered, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered)) + spin_lock_irqsave(&bfs->lock, flags); + bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_zero(fs_info, folio, ordered)) folio_clear_ordered(folio); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, checked, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_set(fs_info, subpage, checked)) + spin_lock_irqsave(&bfs->lock, flags); + bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_set(fs_info, folio, checked)) folio_set_checked(folio); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, checked, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + spin_lock_irqsave(&bfs->lock, flags); + bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); folio_clear_checked(folio); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } /* @@ -540,16 +536,16 @@ void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info, bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ - struct btrfs_subpage *subpage = folio_get_private(folio); \ + struct btrfs_folio_state *bfs = folio_get_private(folio); \ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, \ name, start, len); \ unsigned long flags; \ bool ret; \ \ - spin_lock_irqsave(&subpage->lock, flags); \ - ret = bitmap_test_range_all_set(subpage->bitmaps, start_bit, \ + spin_lock_irqsave(&bfs->lock, flags); \ + ret = bitmap_test_range_all_set(bfs->bitmaps, start_bit, \ len >> fs_info->sectorsize_bits); \ - spin_unlock_irqrestore(&subpage->lock, flags); \ + spin_unlock_irqrestore(&bfs->lock, flags); \ return ret; \ } IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate); @@ -569,7 +565,7 @@ void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) { \ + !btrfs_is_subpage(fs_info, folio)) { \ folio_set_func(folio); \ return; \ } \ @@ -579,7 +575,7 @@ void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) { \ + !btrfs_is_subpage(fs_info, folio)) { \ folio_clear_func(folio); \ return; \ } \ @@ -589,7 +585,7 @@ bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) \ + !btrfs_is_subpage(fs_info, folio)) \ return folio_test_func(folio); \ return btrfs_subpage_test_##name(fs_info, folio, start, len); \ } \ @@ -597,7 +593,7 @@ void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) { \ + !btrfs_is_subpage(fs_info, folio)) { \ folio_set_func(folio); \ return; \ } \ @@ -608,7 +604,7 @@ void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) { \ + !btrfs_is_subpage(fs_info, folio)) { \ folio_clear_func(folio); \ return; \ } \ @@ -619,10 +615,32 @@ bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) \ + !btrfs_is_subpage(fs_info, folio)) \ return folio_test_func(folio); \ btrfs_subpage_clamp_range(folio, &start, &len); \ return btrfs_subpage_test_##name(fs_info, folio, start, len); \ +} \ +void btrfs_meta_folio_set_##name(struct folio *folio, const struct extent_buffer *eb) \ +{ \ + if (!btrfs_meta_is_subpage(eb->fs_info)) { \ + folio_set_func(folio); \ + return; \ + } \ + btrfs_subpage_set_##name(eb->fs_info, folio, eb->start, eb->len); \ +} \ +void btrfs_meta_folio_clear_##name(struct folio *folio, const struct extent_buffer *eb) \ +{ \ + if (!btrfs_meta_is_subpage(eb->fs_info)) { \ + folio_clear_func(folio); \ + return; \ + } \ + btrfs_subpage_clear_##name(eb->fs_info, folio, eb->start, eb->len); \ +} \ +bool btrfs_meta_folio_test_##name(struct folio *folio, const struct extent_buffer *eb) \ +{ \ + if (!btrfs_meta_is_subpage(eb->fs_info)) \ + return folio_test_func(folio); \ + return btrfs_subpage_test_##name(eb->fs_info, folio, eb->start, eb->len); \ } IMPLEMENT_BTRFS_PAGE_OPS(uptodate, folio_mark_uptodate, folio_clear_uptodate, folio_test_uptodate); @@ -635,6 +653,31 @@ IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered, IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, folio_test_checked); +#define GET_SUBPAGE_BITMAP(fs_info, folio, name, dst) \ +{ \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ + const struct btrfs_folio_state *bfs = folio_get_private(folio); \ + \ + ASSERT(blocks_per_folio <= BITS_PER_LONG); \ + *dst = bitmap_read(bfs->bitmaps, \ + blocks_per_folio * btrfs_bitmap_nr_##name, \ + blocks_per_folio); \ +} + +#define SUBPAGE_DUMP_BITMAP(fs_info, folio, name, start, len) \ +{ \ + unsigned long bitmap; \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ + \ + GET_SUBPAGE_BITMAP(fs_info, folio, name, &bitmap); \ + btrfs_warn(fs_info, \ + "dumpping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \ + start, len, folio_pos(folio), \ + blocks_per_folio, &bitmap); \ +} + /* * Make sure not only the page dirty bit is cleared, but also subpage dirty bit * is cleared. @@ -642,7 +685,7 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; unsigned int start_bit; unsigned int nbits; unsigned long flags; @@ -650,18 +693,22 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) return; - if (!btrfs_is_subpage(fs_info, folio->mapping)) { + if (!btrfs_is_subpage(fs_info, folio)) { ASSERT(!folio_test_dirty(folio)); return; } start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len); nbits = len >> fs_info->sectorsize_bits; - subpage = folio_get_private(folio); - ASSERT(subpage); - spin_lock_irqsave(&subpage->lock, flags); - ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); - spin_unlock_irqrestore(&subpage->lock, flags); + bfs = folio_get_private(folio); + ASSERT(bfs); + spin_lock_irqsave(&bfs->lock, flags); + if (unlikely(!bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits))) { + SUBPAGE_DUMP_BITMAP(fs_info, folio, dirty, start, len); + ASSERT(bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits)); + } + ASSERT(bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits)); + spin_unlock_irqrestore(&bfs->lock, flags); } /* @@ -674,86 +721,103 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; unsigned long flags; unsigned int start_bit; unsigned int nbits; int ret; ASSERT(folio_test_locked(folio)); - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio)) return; - subpage = folio_get_private(folio); + bfs = folio_get_private(folio); start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); nbits = len >> fs_info->sectorsize_bits; - spin_lock_irqsave(&subpage->lock, flags); + spin_lock_irqsave(&bfs->lock, flags); /* Target range should not yet be locked. */ - ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); - bitmap_set(subpage->bitmaps, start_bit, nbits); - ret = atomic_add_return(nbits, &subpage->nr_locked); - ASSERT(ret <= fs_info->sectors_per_page); - spin_unlock_irqrestore(&subpage->lock, flags); + if (unlikely(!bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits))) { + SUBPAGE_DUMP_BITMAP(fs_info, folio, locked, start, len); + ASSERT(bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits)); + } + bitmap_set(bfs->bitmaps, start_bit, nbits); + ret = atomic_add_return(nbits, &bfs->nr_locked); + ASSERT(ret <= btrfs_blocks_per_folio(fs_info, folio)); + spin_unlock_irqrestore(&bfs->lock, flags); } -#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ -{ \ - const int sectors_per_page = fs_info->sectors_per_page; \ - \ - ASSERT(sectors_per_page < BITS_PER_LONG); \ - *dst = bitmap_read(subpage->bitmaps, \ - sectors_per_page * btrfs_bitmap_nr_##name, \ - sectors_per_page); \ +/* + * Clear the dirty flag for the folio. + * + * If the affected folio is no longer dirty, return true. Otherwise return false. + */ +bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct extent_buffer *eb) +{ + bool last; + + if (!btrfs_meta_is_subpage(eb->fs_info)) { + folio_clear_dirty_for_io(folio); + return true; + } + + last = btrfs_subpage_clear_and_test_dirty(eb->fs_info, folio, eb->start, eb->len); + if (last) { + folio_clear_dirty_for_io(folio); + return true; + } + return false; } void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage; - const u32 sectors_per_page = fs_info->sectors_per_page; + struct btrfs_folio_state *bfs; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); unsigned long uptodate_bitmap; unsigned long dirty_bitmap; unsigned long writeback_bitmap; unsigned long ordered_bitmap; unsigned long checked_bitmap; + unsigned long locked_bitmap; unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); - ASSERT(sectors_per_page > 1); - subpage = folio_get_private(folio); - - spin_lock_irqsave(&subpage->lock, flags); - GET_SUBPAGE_BITMAP(subpage, fs_info, uptodate, &uptodate_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, &dirty_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, writeback, &writeback_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, ordered, &ordered_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, checked, &checked_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &checked_bitmap); - spin_unlock_irqrestore(&subpage->lock, flags); - - dump_page(folio_page(folio, 0), "btrfs subpage dump"); + ASSERT(blocks_per_folio > 1); + bfs = folio_get_private(folio); + + spin_lock_irqsave(&bfs->lock, flags); + GET_SUBPAGE_BITMAP(fs_info, folio, uptodate, &uptodate_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, dirty, &dirty_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, writeback, &writeback_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, ordered, &ordered_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, checked, &checked_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, locked, &locked_bitmap); + spin_unlock_irqrestore(&bfs->lock, flags); + + dump_page(folio_page(folio, 0), "btrfs folio state dump"); btrfs_warn(fs_info, -"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", +"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", start, len, folio_pos(folio), - sectors_per_page, &uptodate_bitmap, - sectors_per_page, &dirty_bitmap, - sectors_per_page, &writeback_bitmap, - sectors_per_page, &ordered_bitmap, - sectors_per_page, &checked_bitmap); + blocks_per_folio, &uptodate_bitmap, + blocks_per_folio, &dirty_bitmap, + blocks_per_folio, &locked_bitmap, + blocks_per_folio, &writeback_bitmap, + blocks_per_folio, &ordered_bitmap, + blocks_per_folio, &checked_bitmap); } void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, struct folio *folio, unsigned long *ret_bitmap) { - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); - ASSERT(fs_info->sectors_per_page > 1); - subpage = folio_get_private(folio); + ASSERT(btrfs_blocks_per_folio(fs_info, folio) > 1); + bfs = folio_get_private(folio); - spin_lock_irqsave(&subpage->lock, flags); - GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, ret_bitmap); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_lock_irqsave(&bfs->lock, flags); + GET_SUBPAGE_BITMAP(fs_info, folio, dirty, ret_bitmap); + spin_unlock_irqrestore(&bfs->lock, flags); } diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 428fa9389fd4..ee0710eb13fd 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -6,10 +6,11 @@ #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/sizes.h> +#include "btrfs_inode.h" +#include "fs.h" struct address_space; struct folio; -struct btrfs_fs_info; /* * Extra info for subpapge bitmap. @@ -31,9 +32,31 @@ struct btrfs_fs_info; enum { btrfs_bitmap_nr_uptodate = 0, btrfs_bitmap_nr_dirty, + + /* + * This can be changed to atomic eventually. But this change will rely + * on the async delalloc range rework for locked bitmap. As async + * delalloc can unlock its range and mark blocks writeback at random + * timing. + */ btrfs_bitmap_nr_writeback, + + /* + * The ordered and checked flags are for COW fixup, already marked + * deprecated, and will be removed eventually. + */ btrfs_bitmap_nr_ordered, btrfs_bitmap_nr_checked, + + /* + * The locked bit is for async delalloc range (compression), currently + * async extent is queued with the range locked, until the compression + * is done. + * So an async extent can unlock the range at any random timing. + * + * This will need a rework on the async extent lifespan (mark writeback + * and do compression) before deprecating this flag. + */ btrfs_bitmap_nr_locked, btrfs_bitmap_nr_max }; @@ -42,7 +65,7 @@ enum { * Structure to trace status of each sector inside a page, attached to * page::private for both data and metadata inodes. */ -struct btrfs_subpage { +struct btrfs_folio_state { /* Common members for both data and metadata pages */ spinlock_t lock; union { @@ -50,7 +73,7 @@ struct btrfs_subpage { * Structures only used by metadata * * @eb_refs should only be operated under private_lock, as it - * manages whether the subpage can be detached. + * manages whether the btrfs_folio_state can be detached. */ atomic_t eb_refs; @@ -64,29 +87,44 @@ struct btrfs_subpage { unsigned long bitmaps[]; }; -enum btrfs_subpage_type { +enum btrfs_folio_type { BTRFS_SUBPAGE_METADATA, BTRFS_SUBPAGE_DATA, }; -#if PAGE_SIZE > SZ_4K -bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping); -#else +/* + * Subpage support for metadata is more complex, as we can have dummy extent + * buffers, where folios have no mapping to determine the owning inode. + * + * Thankfully we only need to check if node size is smaller than page size. + * Even with larger folio support, we will only allocate a folio as large as + * node size. + * Thus if nodesize < PAGE_SIZE, we know metadata needs need to subpage routine. + */ +static inline bool btrfs_meta_is_subpage(const struct btrfs_fs_info *fs_info) +{ + return fs_info->nodesize < PAGE_SIZE; +} static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, - struct address_space *mapping) + struct folio *folio) { - return false; + if (folio->mapping && folio->mapping->host) + ASSERT(is_data_inode(BTRFS_I(folio->mapping->host))); + return fs_info->sectorsize < folio_size(folio); } -#endif -int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, - struct folio *folio, enum btrfs_subpage_type type); -void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio); +int btrfs_attach_folio_state(const struct btrfs_fs_info *fs_info, + struct folio *folio, enum btrfs_folio_type type); +void btrfs_detach_folio_state(const struct btrfs_fs_info *fs_info, struct folio *folio, + enum btrfs_folio_type type); /* Allocate additional data where page represents more than one sector */ -struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, - enum btrfs_subpage_type type); -void btrfs_free_subpage(struct btrfs_subpage *subpage); +struct btrfs_folio_state *btrfs_alloc_folio_state(const struct btrfs_fs_info *fs_info, + size_t fsize, enum btrfs_folio_type type); +static inline void btrfs_free_folio_state(struct btrfs_folio_state *bfs) +{ + kfree(bfs); +} void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); @@ -110,6 +148,13 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, * btrfs_folio_clamp_*() are similar to btrfs_folio_*(), except the range doesn't * need to be inside the page. Those functions will truncate the range * automatically. + * + * Both btrfs_folio_*() and btrfs_folio_clamp_*() are for data folios. + * + * For metadata, one should use btrfs_meta_folio_*() helpers instead, and there + * is no clamp version for metadata helpers, as we either go subpage + * (nodesize < PAGE_SIZE) or go regular folio helpers (nodesize >= PAGE_SIZE, + * and our folio is never larger than nodesize). */ #define DECLARE_BTRFS_SUBPAGE_OPS(name) \ void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \ @@ -129,7 +174,10 @@ void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len); \ bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ - struct folio *folio, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); \ +void btrfs_meta_folio_set_##name(struct folio *folio, const struct extent_buffer *eb); \ +void btrfs_meta_folio_clear_##name(struct folio *folio, const struct extent_buffer *eb); \ +bool btrfs_meta_folio_test_##name(struct folio *folio, const struct extent_buffer *eb); DECLARE_BTRFS_SUBPAGE_OPS(uptodate); DECLARE_BTRFS_SUBPAGE_OPS(dirty); @@ -137,11 +185,25 @@ DECLARE_BTRFS_SUBPAGE_OPS(writeback); DECLARE_BTRFS_SUBPAGE_OPS(ordered); DECLARE_BTRFS_SUBPAGE_OPS(checked); +/* + * Helper for error cleanup, where a folio will have its dirty flag cleared, + * with writeback started and finished. + */ +static inline void btrfs_folio_clamp_finish_io(struct btrfs_fs_info *fs_info, + struct folio *locked_folio, + u64 start, u32 len) +{ + btrfs_folio_clamp_clear_dirty(fs_info, locked_folio, start, len); + btrfs_folio_clamp_set_writeback(fs_info, locked_folio, start, len); + btrfs_folio_clamp_clear_writeback(fs_info, locked_folio, start, len); +} + bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); +bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct extent_buffer *eb); void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, struct folio *folio, unsigned long *ret_bitmap); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 97a85d180b61..68e35a3700ff 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -84,7 +84,7 @@ struct btrfs_fs_context { u32 thread_pool_size; unsigned long long mount_opt; unsigned long compress_type:4; - unsigned int compress_level; + int compress_level; refcount_t refs; }; @@ -125,7 +125,6 @@ enum { /* Rescue options */ Opt_rescue, Opt_usebackuproot, - Opt_nologreplay, /* Debugging options */ Opt_enospc_debug, @@ -246,8 +245,6 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = { /* Rescue options. */ fsparam_enum("rescue", Opt_rescue, btrfs_parameter_rescue), - /* Deprecated, with alias rescue=nologreplay */ - __fsparam(NULL, "nologreplay", Opt_nologreplay, fs_param_deprecated, NULL), /* Deprecated, with alias rescue=usebackuproot */ __fsparam(NULL, "usebackuproot", Opt_usebackuproot, fs_param_deprecated, NULL), /* For compatibility only, alias for "rescue=nologreplay". */ @@ -264,10 +261,65 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = { {} }; -/* No support for restricting writes to btrfs devices yet... */ -static inline blk_mode_t btrfs_open_mode(struct fs_context *fc) +static bool btrfs_match_compress_type(const char *string, const char *type, bool may_have_level) { - return sb_open_mode(fc->sb_flags) & ~BLK_OPEN_RESTRICT_WRITES; + const int len = strlen(type); + + return (strncmp(string, type, len) == 0) && + ((may_have_level && string[len] == ':') || string[len] == '\0'); +} + +static int btrfs_parse_compress(struct btrfs_fs_context *ctx, + const struct fs_parameter *param, int opt) +{ + const char *string = param->string; + + /* + * Provide the same semantics as older kernels that don't use fs + * context, specifying the "compress" option clears "force-compress" + * without the need to pass "compress-force=[no|none]" before + * specifying "compress". + */ + if (opt != Opt_compress_force && opt != Opt_compress_force_type) + btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); + + if (opt == Opt_compress || opt == Opt_compress_force) { + ctx->compress_type = BTRFS_COMPRESS_ZLIB; + ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (btrfs_match_compress_type(string, "zlib", true)) { + ctx->compress_type = BTRFS_COMPRESS_ZLIB; + ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, + string + 4); + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (btrfs_match_compress_type(string, "lzo", false)) { + ctx->compress_type = BTRFS_COMPRESS_LZO; + ctx->compress_level = 0; + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (btrfs_match_compress_type(string, "zstd", true)) { + ctx->compress_type = BTRFS_COMPRESS_ZSTD; + ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, + string + 4); + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (btrfs_match_compress_type(string, "no", false) || + btrfs_match_compress_type(string, "none", false)) { + ctx->compress_level = 0; + ctx->compress_type = 0; + btrfs_clear_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); + } else { + btrfs_err(NULL, "unrecognized compression value %s", string); + return -EINVAL; + } + return 0; } static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) @@ -306,10 +358,9 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) break; case Opt_device: { struct btrfs_device *device; - blk_mode_t mode = btrfs_open_mode(fc); mutex_lock(&uuid_mutex); - device = btrfs_scan_one_device(param->string, mode, false); + device = btrfs_scan_one_device(param->string, false); mutex_unlock(&uuid_mutex); if (IS_ERR(device)) return PTR_ERR(device); @@ -339,53 +390,8 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) fallthrough; case Opt_compress: case Opt_compress_type: - /* - * Provide the same semantics as older kernels that don't use fs - * context, specifying the "compress" option clears - * "force-compress" without the need to pass - * "compress-force=[no|none]" before specifying "compress". - */ - if (opt != Opt_compress_force && opt != Opt_compress_force_type) - btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); - - if (opt == Opt_compress || opt == Opt_compress_force) { - ctx->compress_type = BTRFS_COMPRESS_ZLIB; - ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; - btrfs_set_opt(ctx->mount_opt, COMPRESS); - btrfs_clear_opt(ctx->mount_opt, NODATACOW); - btrfs_clear_opt(ctx->mount_opt, NODATASUM); - } else if (strncmp(param->string, "zlib", 4) == 0) { - ctx->compress_type = BTRFS_COMPRESS_ZLIB; - ctx->compress_level = - btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, - param->string + 4); - btrfs_set_opt(ctx->mount_opt, COMPRESS); - btrfs_clear_opt(ctx->mount_opt, NODATACOW); - btrfs_clear_opt(ctx->mount_opt, NODATASUM); - } else if (strncmp(param->string, "lzo", 3) == 0) { - ctx->compress_type = BTRFS_COMPRESS_LZO; - ctx->compress_level = 0; - btrfs_set_opt(ctx->mount_opt, COMPRESS); - btrfs_clear_opt(ctx->mount_opt, NODATACOW); - btrfs_clear_opt(ctx->mount_opt, NODATASUM); - } else if (strncmp(param->string, "zstd", 4) == 0) { - ctx->compress_type = BTRFS_COMPRESS_ZSTD; - ctx->compress_level = - btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, - param->string + 4); - btrfs_set_opt(ctx->mount_opt, COMPRESS); - btrfs_clear_opt(ctx->mount_opt, NODATACOW); - btrfs_clear_opt(ctx->mount_opt, NODATASUM); - } else if (strncmp(param->string, "no", 2) == 0) { - ctx->compress_level = 0; - ctx->compress_type = 0; - btrfs_clear_opt(ctx->mount_opt, COMPRESS); - btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); - } else { - btrfs_err(NULL, "unrecognized compression value %s", - param->string); + if (btrfs_parse_compress(ctx, param, opt)) return -EINVAL; - } break; case Opt_ssd: if (result.negated) { @@ -449,11 +455,6 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) else btrfs_clear_opt(ctx->mount_opt, NOTREELOG); break; - case Opt_nologreplay: - btrfs_warn(NULL, - "'nologreplay' is deprecated, use 'rescue=nologreplay' instead"); - btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); - break; case Opt_norecovery: btrfs_info(NULL, "'norecovery' is for compatibility only, recommended to use 'rescue=nologreplay'"); @@ -569,6 +570,10 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) break; case Opt_commit_interval: ctx->commit_interval = result.uint_32; + if (ctx->commit_interval > BTRFS_WARNING_COMMIT_INTERVAL) { + btrfs_warn(NULL, "excessive commit interval %u, use with care", + ctx->commit_interval); + } if (ctx->commit_interval == 0) ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; break; @@ -947,44 +952,44 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec static int btrfs_fill_super(struct super_block *sb, struct btrfs_fs_devices *fs_devices) { - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_fs_info *fs_info = btrfs_sb(sb); - int err; + int ret; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_magic = BTRFS_SUPER_MAGIC; sb->s_op = &btrfs_super_ops; - sb->s_d_op = &btrfs_dentry_operations; + set_default_d_op(sb, &btrfs_dentry_operations); sb->s_export_op = &btrfs_export_ops; #ifdef CONFIG_FS_VERITY sb->s_vop = &btrfs_verityops; #endif sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; - sb->s_iflags |= SB_I_CGROUPWB; + sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM; - err = super_setup_bdi(sb); - if (err) { + ret = super_setup_bdi(sb); + if (ret) { btrfs_err(fs_info, "super_setup_bdi failed"); - return err; + return ret; } - err = open_ctree(sb, fs_devices); - if (err) { - btrfs_err(fs_info, "open_ctree failed"); - return err; + ret = open_ctree(sb, fs_devices); + if (ret) { + btrfs_err(fs_info, "open_ctree failed: %d", ret); + return ret; } inode = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root); if (IS_ERR(inode)) { - err = PTR_ERR(inode); - btrfs_handle_fs_error(fs_info, err, NULL); + ret = PTR_ERR(inode); + btrfs_handle_fs_error(fs_info, ret, NULL); goto fail_close; } - sb->s_root = d_make_root(inode); + sb->s_root = d_make_root(&inode->vfs_inode); if (!sb->s_root) { - err = -ENOMEM; + ret = -ENOMEM; goto fail_close; } @@ -993,7 +998,7 @@ static int btrfs_fill_super(struct super_block *sb, fail_close: close_ctree(fs_info); - return err; + return ret; } int btrfs_sync_fs(struct super_block *sb, int wait) @@ -1139,8 +1144,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) subvol_name = btrfs_get_subvol_name_from_objectid(info, btrfs_root_id(BTRFS_I(d_inode(dentry))->root)); if (!IS_ERR(subvol_name)) { - seq_puts(seq, ",subvol="); - seq_escape(seq, subvol_name, " \t\n\\"); + seq_show_option(seq, "subvol", subvol_name); kfree(subvol_name); } return 0; @@ -1149,11 +1153,11 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) /* * subvolumes are identified by ino 256 */ -static inline int is_subvolume_inode(struct inode *inode) +static inline bool is_subvolume_inode(struct inode *inode) { if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) - return 1; - return 0; + return true; + return false; } static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, @@ -1831,10 +1835,9 @@ static int btrfs_get_tree_super(struct fs_context *fc) struct btrfs_fs_info *fs_info = fc->s_fs_info; struct btrfs_fs_context *ctx = fc->fs_private; struct btrfs_fs_devices *fs_devices = NULL; - struct block_device *bdev; struct btrfs_device *device; struct super_block *sb; - blk_mode_t mode = btrfs_open_mode(fc); + blk_mode_t mode = sb_open_mode(fc->sb_flags); int ret; btrfs_ctx_to_info(fs_info, ctx); @@ -1844,69 +1847,104 @@ static int btrfs_get_tree_super(struct fs_context *fc) * With 'true' passed to btrfs_scan_one_device() (mount time) we expect * either a valid device or an error. */ - device = btrfs_scan_one_device(fc->source, mode, true); + device = btrfs_scan_one_device(fc->source, true); ASSERT(device != NULL); if (IS_ERR(device)) { mutex_unlock(&uuid_mutex); return PTR_ERR(device); } - fs_devices = device->fs_devices; + /* + * We cannot hold uuid_mutex calling sget_fc(), it will lead to a + * locking order reversal with s_umount. + * + * So here we increase the holding number of fs_devices, this will ensure + * the fs_devices itself won't be freed. + */ + btrfs_fs_devices_inc_holding(fs_devices); fs_info->fs_devices = fs_devices; - - ret = btrfs_open_devices(fs_devices, mode, &btrfs_fs_type); mutex_unlock(&uuid_mutex); - if (ret) - return ret; - - if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) { - ret = -EACCES; - goto error; - } - bdev = fs_devices->latest_dev->bdev; - /* - * From now on the error handling is not straightforward. - * - * If successful, this will transfer the fs_info into the super block, - * and fc->s_fs_info will be NULL. However if there's an existing - * super, we'll still have fc->s_fs_info populated. If we error - * completely out it'll be cleaned up when we drop the fs_context, - * otherwise it's tied to the lifetime of the super_block. - */ sb = sget_fc(fc, btrfs_fc_test_super, set_anon_super_fc); if (IS_ERR(sb)) { - ret = PTR_ERR(sb); - goto error; + mutex_lock(&uuid_mutex); + btrfs_fs_devices_dec_holding(fs_devices); + /* + * Since the fs_devices is not opened, it can be freed at any + * time after unlocking uuid_mutex. We need to avoid double + * free through put_fs_context()->btrfs_free_fs_info(). + * So here we reset fs_info->fs_devices to NULL, and let the + * regular fs_devices reclaim path to handle it. + * + * This applies to all later branches where no fs_devices is + * opened. + */ + fs_info->fs_devices = NULL; + mutex_unlock(&uuid_mutex); + return PTR_ERR(sb); } set_device_specific_options(fs_info); if (sb->s_root) { - btrfs_close_devices(fs_devices); - if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) - ret = -EBUSY; + /* + * Not the first mount of the fs thus got an existing super block. + * Will reuse the returned super block, fs_info and fs_devices. + * + * fc->s_fs_info is not touched and will be later freed by + * put_fs_context() through btrfs_free_fs_context(). + */ + ASSERT(fc->s_fs_info == fs_info); + + mutex_lock(&uuid_mutex); + btrfs_fs_devices_dec_holding(fs_devices); + fs_info->fs_devices = NULL; + mutex_unlock(&uuid_mutex); + /* + * At this stage we may have RO flag mismatch between + * fc->sb_flags and sb->s_flags. Caller should detect such + * mismatch and reconfigure with sb->s_umount rwsem held if + * needed. + */ } else { + struct block_device *bdev; + + /* + * The first mount of the fs thus a new superblock, fc->s_fs_info + * must be NULL, and the ownership of our fs_info and fs_devices is + * transferred to the super block. + */ + ASSERT(fc->s_fs_info == NULL); + + mutex_lock(&uuid_mutex); + btrfs_fs_devices_dec_holding(fs_devices); + ret = btrfs_open_devices(fs_devices, mode, sb); + if (ret < 0) + fs_info->fs_devices = NULL; + mutex_unlock(&uuid_mutex); + if (ret < 0) { + deactivate_locked_super(sb); + return ret; + } + if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) { + deactivate_locked_super(sb); + return -EACCES; + } + bdev = fs_devices->latest_dev->bdev; snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id); - btrfs_sb(sb)->bdev_holder = &btrfs_fs_type; ret = btrfs_fill_super(sb, fs_devices); - } - - if (ret) { - deactivate_locked_super(sb); - return ret; + if (ret) { + deactivate_locked_super(sb); + return ret; + } } btrfs_clear_oneshot_options(fs_info); fc->root = dget(sb->s_root); return 0; - -error: - btrfs_close_devices(fs_devices); - return ret; } /* @@ -1982,39 +2020,14 @@ error: * btrfs or not, setting the whole super block RO. To make per-subvolume mounting * work with different options work we need to keep backward compatibility. */ -static struct vfsmount *btrfs_reconfigure_for_mount(struct fs_context *fc) +static int btrfs_reconfigure_for_mount(struct fs_context *fc) { - struct vfsmount *mnt; - int ret; - const bool ro2rw = !(fc->sb_flags & SB_RDONLY); - - /* - * We got an EBUSY because our SB_RDONLY flag didn't match the existing - * super block, so invert our setting here and retry the mount so we - * can get our vfsmount. - */ - if (ro2rw) - fc->sb_flags |= SB_RDONLY; - else - fc->sb_flags &= ~SB_RDONLY; - - mnt = fc_mount(fc); - if (IS_ERR(mnt)) - return mnt; + int ret = 0; - if (!ro2rw) - return mnt; + if (!(fc->sb_flags & SB_RDONLY) && (fc->root->d_sb->s_flags & SB_RDONLY)) + ret = btrfs_reconfigure(fc); - /* We need to convert to rw, call reconfigure. */ - fc->sb_flags &= ~SB_RDONLY; - down_write(&mnt->mnt_sb->s_umount); - ret = btrfs_reconfigure(fc); - up_write(&mnt->mnt_sb->s_umount); - if (ret) { - mntput(mnt); - return ERR_PTR(ret); - } - return mnt; + return ret; } static int btrfs_get_tree_subvol(struct fs_context *fc) @@ -2024,6 +2037,7 @@ static int btrfs_get_tree_subvol(struct fs_context *fc) struct fs_context *dup_fc; struct dentry *dentry; struct vfsmount *mnt; + int ret = 0; /* * Setup a dummy root and fs_info for test/set super. This is because @@ -2057,17 +2071,15 @@ static int btrfs_get_tree_subvol(struct fs_context *fc) */ dup_fc->s_fs_info = fs_info; - /* - * We'll do the security settings in our btrfs_get_tree_super() mount - * loop, they were duplicated into dup_fc, we can drop the originals - * here. - */ - security_free_mnt_opts(&fc->security); - fc->security = NULL; + ret = btrfs_get_tree_super(dup_fc); + if (ret) + goto error; - mnt = fc_mount(dup_fc); - if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) - mnt = btrfs_reconfigure_for_mount(dup_fc); + ret = btrfs_reconfigure_for_mount(dup_fc); + up_write(&dup_fc->root->d_sb->s_umount); + if (ret) + goto error; + mnt = vfs_create_mount(dup_fc); put_fs_context(dup_fc); if (IS_ERR(mnt)) return PTR_ERR(mnt); @@ -2084,25 +2096,15 @@ static int btrfs_get_tree_subvol(struct fs_context *fc) fc->root = dentry; return 0; +error: + put_fs_context(dup_fc); + return ret; } static int btrfs_get_tree(struct fs_context *fc) { - /* - * Since we use mount_subtree to mount the default/specified subvol, we - * have to do mounts in two steps. - * - * First pass through we call btrfs_get_tree_subvol(), this is just a - * wrapper around fc_mount() to call back into here again, and this time - * we'll call btrfs_get_tree_super(). This will do the open_ctree() and - * everything to open the devices and file system. Then we return back - * with a fully constructed vfsmount in btrfs_get_tree_subvol(), and - * from there we can do our mount_subvol() call, which will lookup - * whichever subvol we're mounting and setup this fc with the - * appropriate dentry for the subvol. - */ - if (fc->s_fs_info) - return btrfs_get_tree_super(fc); + ASSERT(fc->s_fs_info == NULL); + return btrfs_get_tree_subvol(fc); } @@ -2234,7 +2236,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, * Scanning outside of mount can return NULL which would turn * into 0 error code. */ - device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false); + device = btrfs_scan_one_device(vol->name, false); ret = PTR_ERR_OR_ZERO(device); mutex_unlock(&uuid_mutex); break; @@ -2252,7 +2254,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, * Scanning outside of mount can return NULL which would turn * into 0 error code. */ - device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false); + device = btrfs_scan_one_device(vol->name, false); if (IS_ERR_OR_NULL(device)) { mutex_unlock(&uuid_mutex); if (IS_ERR(device)) @@ -2305,7 +2307,7 @@ static int check_dev_super(struct btrfs_device *dev) return 0; /* Only need to check the primary super block. */ - sb = btrfs_read_dev_one_super(dev->bdev, 0, true); + sb = btrfs_read_disk_super(dev->bdev, 0, true); if (IS_ERR(sb)) return PTR_ERR(sb); @@ -2458,6 +2460,9 @@ static __cold void btrfs_interface_exit(void) static int __init btrfs_print_mod_info(void) { static const char options[] = "" +#ifdef CONFIG_BTRFS_EXPERIMENTAL + ", experimental=on" +#endif #ifdef CONFIG_BTRFS_DEBUG ", debug=on" #endif @@ -2478,7 +2483,17 @@ static int __init btrfs_print_mod_info(void) ", fsverity=no" #endif ; + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + if (btrfs_get_mod_read_policy() == NULL) + pr_info("Btrfs loaded%s\n", options); + else + pr_info("Btrfs loaded%s, read_policy=%s\n", + options, btrfs_get_mod_read_policy()); +#else pr_info("Btrfs loaded%s\n", options); +#endif + return 0; } @@ -2525,8 +2540,8 @@ static const struct init_sequence mod_init_seq[] = { .init_func = btrfs_free_space_init, .exit_func = btrfs_free_space_exit, }, { - .init_func = extent_state_init_cachep, - .exit_func = extent_state_free_cachep, + .init_func = btrfs_extent_state_init_cachep, + .exit_func = btrfs_extent_state_free_cachep, }, { .init_func = extent_buffer_init_cachep, .exit_func = extent_buffer_free_cachep, @@ -2534,8 +2549,13 @@ static const struct init_sequence mod_init_seq[] = { .init_func = btrfs_bioset_init, .exit_func = btrfs_bioset_exit, }, { - .init_func = extent_map_init, - .exit_func = extent_map_exit, + .init_func = btrfs_extent_map_init, + .exit_func = btrfs_extent_map_exit, +#ifdef CONFIG_BTRFS_EXPERIMENTAL + }, { + .init_func = btrfs_read_policy_init, + .exit_func = NULL, +#endif }, { .init_func = ordered_data_init, .exit_func = ordered_data_exit, diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index b843308e2bc6..9d398f7a36ad 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -160,8 +160,7 @@ static int can_modify_feature(struct btrfs_feature_attr *fa) clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR; break; default: - pr_warn("btrfs: sysfs: unknown feature set %d\n", - fa->feature_set); + btrfs_warn(NULL, "sysfs: unknown feature set %d", fa->feature_set); return 0; } @@ -295,7 +294,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(simple_quota, SIMPLE_QUOTA); #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); #endif -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL /* Remove once support for extent tree v2 is feature complete */ BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2); /* Remove once support for raid stripe tree is feature complete. */ @@ -329,7 +328,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_PTR(zoned), #endif -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL BTRFS_FEAT_ATTR_PTR(extent_tree_v2), BTRFS_FEAT_ATTR_PTR(raid_stripe_tree), #endif @@ -411,7 +410,8 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj, { ssize_t ret = 0; - /* An artificial limit to only support 4K and PAGE_SIZE */ + if (BTRFS_MIN_BLOCKSIZE != SZ_4K && BTRFS_MIN_BLOCKSIZE != PAGE_SIZE) + ret += sysfs_emit_at(buf, ret, "%u ", BTRFS_MIN_BLOCKSIZE); if (PAGE_SIZE > SZ_4K) ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K); ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE); @@ -1118,7 +1118,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->nodesize); + return sysfs_emit(buf, "%u\n", fs_info->nodesize); } BTRFS_ATTR(, nodesize, btrfs_nodesize_show); @@ -1128,7 +1128,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); + return sysfs_emit(buf, "%u\n", fs_info->sectorsize); } BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); @@ -1137,13 +1137,21 @@ static ssize_t btrfs_commit_stats_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); + u64 now = ktime_get_ns(); + u64 start_time = fs_info->commit_stats.critical_section_start_time; + u64 pending = 0; + + if (start_time) + pending = now - start_time; return sysfs_emit(buf, "commits %llu\n" + "cur_commit_ms %llu\n" "last_commit_ms %llu\n" "max_commit_ms %llu\n" "total_commit_ms %llu\n", fs_info->commit_stats.commit_count, + div_u64(pending, NSEC_PER_MSEC), div_u64(fs_info->commit_stats.last_commit_dur, NSEC_PER_MSEC), div_u64(fs_info->commit_stats.max_commit_dur, NSEC_PER_MSEC), div_u64(fs_info->commit_stats.total_commit_dur, NSEC_PER_MSEC)); @@ -1180,7 +1188,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); + return sysfs_emit(buf, "%u\n", fs_info->sectorsize); } BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); @@ -1201,7 +1209,7 @@ static ssize_t quota_override_store(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); unsigned long knob; - int err; + int ret; if (!fs_info) return -EPERM; @@ -1209,9 +1217,9 @@ static ssize_t quota_override_store(struct kobject *kobj, if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - err = kstrtoul(buf, 10, &knob); - if (err) - return err; + ret = kstrtoul(buf, 10, &knob); + if (ret) + return ret; if (knob > 1) return -EINVAL; @@ -1305,7 +1313,74 @@ static ssize_t btrfs_temp_fsid_show(struct kobject *kobj, } BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); -static const char * const btrfs_read_policy_name[] = { "pid" }; +static const char *btrfs_read_policy_name[] = { + "pid", +#ifdef CONFIG_BTRFS_EXPERIMENTAL + "round-robin", + "devid", +#endif +}; + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + +/* Global module configuration parameters. */ +static char *read_policy; +char *btrfs_get_mod_read_policy(void) +{ + return read_policy; +} + +/* Set perms to 0, disable /sys/module/btrfs/parameter/read_policy interface. */ +module_param(read_policy, charp, 0); +MODULE_PARM_DESC(read_policy, +"Global read policy: pid (default), round-robin[:<min_contig_read>], devid[:<devid>]"); +#endif + +int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) +{ + char param[32]; + char __maybe_unused *value_str; + + if (!str || strlen(str) == 0) + return 0; + + strscpy(param, str); + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* Separate value from input in policy:value format. */ + value_str = strchr(param, ':'); + if (value_str) { + char *retptr; + + *value_str = 0; + value_str++; + if (!value_ret) + return -EINVAL; + + *value_ret = memparse(value_str, &retptr); + /* There could be any trailing typos after the value. */ + retptr = skip_spaces(retptr); + if (*retptr != 0 || *value_ret <= 0) + return -EINVAL; + } +#endif + + return sysfs_match_string(btrfs_read_policy_name, param); +} + +#ifdef CONFIG_BTRFS_EXPERIMENTAL +int __init btrfs_read_policy_init(void) +{ + s64 value; + + if (btrfs_read_policy_to_enum(read_policy, &value) == -EINVAL) { + btrfs_err(NULL, "invalid read policy or value %s", read_policy); + return -EINVAL; + } + + return 0; +} +#endif static ssize_t btrfs_read_policy_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -1316,14 +1391,25 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, int i; for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { - if (policy == i) - ret += sysfs_emit_at(buf, ret, "%s[%s]", - (ret == 0 ? "" : " "), - btrfs_read_policy_name[i]); - else - ret += sysfs_emit_at(buf, ret, "%s%s", - (ret == 0 ? "" : " "), - btrfs_read_policy_name[i]); + if (ret != 0) + ret += sysfs_emit_at(buf, ret, " "); + + if (i == policy) + ret += sysfs_emit_at(buf, ret, "["); + + ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]); + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + if (i == BTRFS_READ_POLICY_RR) + ret += sysfs_emit_at(buf, ret, ":%u", + READ_ONCE(fs_devices->rr_min_contig_read)); + + if (i == BTRFS_READ_POLICY_DEVID) + ret += sysfs_emit_at(buf, ret, ":%llu", + READ_ONCE(fs_devices->read_devid)); +#endif + if (i == policy) + ret += sysfs_emit_at(buf, ret, "]"); } ret += sysfs_emit_at(buf, ret, "\n"); @@ -1336,21 +1422,80 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, const char *buf, size_t len) { struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); - int i; + int index; + s64 value = -1; - for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { - if (sysfs_streq(buf, btrfs_read_policy_name[i])) { - if (i != READ_ONCE(fs_devices->read_policy)) { - WRITE_ONCE(fs_devices->read_policy, i); - btrfs_info(fs_devices->fs_info, - "read policy set to '%s'", - btrfs_read_policy_name[i]); + index = btrfs_read_policy_to_enum(buf, &value); + if (index < 0) + return -EINVAL; + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* If moving from RR then disable collecting fs stats. */ + if (fs_devices->read_policy == BTRFS_READ_POLICY_RR && index != BTRFS_READ_POLICY_RR) + fs_devices->collect_fs_stats = false; + + if (index == BTRFS_READ_POLICY_RR) { + if (value != -1) { + const u32 sectorsize = fs_devices->fs_info->sectorsize; + + if (!IS_ALIGNED(value, sectorsize)) { + u64 temp_value = round_up(value, sectorsize); + + btrfs_debug(fs_devices->fs_info, +"read_policy: min contig read %lld should be multiple of sectorsize %u, rounded to %llu", + value, sectorsize, temp_value); + value = temp_value; } - return len; + } else { + value = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; + } + + if (index != READ_ONCE(fs_devices->read_policy) || + value != READ_ONCE(fs_devices->rr_min_contig_read)) { + WRITE_ONCE(fs_devices->read_policy, index); + WRITE_ONCE(fs_devices->rr_min_contig_read, value); + + btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'", + btrfs_read_policy_name[index], value); } + + fs_devices->collect_fs_stats = true; + + return len; } - return -EINVAL; + if (index == BTRFS_READ_POLICY_DEVID) { + if (value != -1) { + BTRFS_DEV_LOOKUP_ARGS(args); + + /* Validate input devid. */ + args.devid = value; + if (btrfs_find_device(fs_devices, &args) == NULL) + return -EINVAL; + } else { + /* Set default devid to the devid of the latest device. */ + value = fs_devices->latest_dev->devid; + } + + if (index != READ_ONCE(fs_devices->read_policy) || + value != READ_ONCE(fs_devices->read_devid)) { + WRITE_ONCE(fs_devices->read_policy, index); + WRITE_ONCE(fs_devices->read_devid, value); + + btrfs_info(fs_devices->fs_info, "read policy set to '%s:%llu'", + btrfs_read_policy_name[index], value); + } + + return len; + } +#endif + if (index != READ_ONCE(fs_devices->read_policy)) { + WRITE_ONCE(fs_devices->read_policy, index); + btrfs_info(fs_devices->fs_info, "read policy set to '%s'", + btrfs_read_policy_name[index]); + } + + return len; } BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store); @@ -1792,16 +1937,35 @@ void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info) kobject_put(&space_info->kobj); } -static const char *alloc_name(u64 flags) +static const char *alloc_name(struct btrfs_space_info *space_info) { + u64 flags = space_info->flags; + switch (flags) { case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA: return "mixed"; case BTRFS_BLOCK_GROUP_METADATA: - return "metadata"; + switch (space_info->subgroup_id) { + case BTRFS_SUB_GROUP_PRIMARY: + return "metadata"; + case BTRFS_SUB_GROUP_TREELOG: + return "metadata-treelog"; + default: + WARN_ON_ONCE(1); + return "metadata (unknown sub-group)"; + } case BTRFS_BLOCK_GROUP_DATA: - return "data"; + switch (space_info->subgroup_id) { + case BTRFS_SUB_GROUP_PRIMARY: + return "data"; + case BTRFS_SUB_GROUP_DATA_RELOC: + return "data-reloc"; + default: + WARN_ON_ONCE(1); + return "data (unknown sub-group)"; + } case BTRFS_BLOCK_GROUP_SYSTEM: + ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY); return "system"; default: WARN_ON(1); @@ -1820,7 +1984,7 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, fs_info->space_info_kobj, "%s", - alloc_name(space_info->flags)); + alloc_name(space_info)); if (ret) { kobject_put(&space_info->kobj); return ret; @@ -2082,7 +2246,7 @@ void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action) ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); if (ret) - pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", + btrfs_warn(NULL, "sending event %d to kobject: '%s' (%p): failed", action, kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), &disk_to_dev(bdev->bd_disk)->kobj); } @@ -2125,15 +2289,15 @@ static struct kset *btrfs_kset; */ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs) { - int error; + int ret; init_completion(&fs_devs->kobj_unregister); fs_devs->fsid_kobj.kset = btrfs_kset; - error = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, NULL, - "%pU", fs_devs->fsid); - if (error) { + ret = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, NULL, + "%pU", fs_devs->fsid); + if (ret) { kobject_put(&fs_devs->fsid_kobj); - return error; + return ret; } fs_devs->devices_kobj = kobject_create_and_add("devices", @@ -2159,71 +2323,70 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs) int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) { - int error; + int ret; struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; struct kobject *fsid_kobj = &fs_devs->fsid_kobj; - error = btrfs_sysfs_add_fs_devices(fs_devs); - if (error) - return error; + ret = btrfs_sysfs_add_fs_devices(fs_devs); + if (ret) + return ret; - error = sysfs_create_files(fsid_kobj, btrfs_attrs); - if (error) { + ret = sysfs_create_files(fsid_kobj, btrfs_attrs); + if (ret) { btrfs_sysfs_remove_fs_devices(fs_devs); - return error; + return ret; } - error = sysfs_create_group(fsid_kobj, - &btrfs_feature_attr_group); - if (error) + ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group); + if (ret) goto failure; #ifdef CONFIG_BTRFS_DEBUG fs_info->debug_kobj = kobject_create_and_add("debug", fsid_kobj); if (!fs_info->debug_kobj) { - error = -ENOMEM; + ret = -ENOMEM; goto failure; } - error = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs); - if (error) + ret = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs); + if (ret) goto failure; #endif /* Discard directory */ fs_info->discard_kobj = kobject_create_and_add("discard", fsid_kobj); if (!fs_info->discard_kobj) { - error = -ENOMEM; + ret = -ENOMEM; goto failure; } - error = sysfs_create_files(fs_info->discard_kobj, discard_attrs); - if (error) + ret = sysfs_create_files(fs_info->discard_kobj, discard_attrs); + if (ret) goto failure; - error = addrm_unknown_feature_attrs(fs_info, true); - if (error) + ret = addrm_unknown_feature_attrs(fs_info, true); + if (ret) goto failure; - error = sysfs_create_link(fsid_kobj, &fs_info->sb->s_bdi->dev->kobj, "bdi"); - if (error) + ret = sysfs_create_link(fsid_kobj, &fs_info->sb->s_bdi->dev->kobj, "bdi"); + if (ret) goto failure; fs_info->space_info_kobj = kobject_create_and_add("allocation", fsid_kobj); if (!fs_info->space_info_kobj) { - error = -ENOMEM; + ret = -ENOMEM; goto failure; } - error = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs); - if (error) + ret = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs); + if (ret) goto failure; return 0; failure: btrfs_sysfs_remove_mounted(fs_info); - return error; + return ret; } static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj, diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index e6a284c59809..0f94ae923210 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -7,6 +7,7 @@ #include <linux/compiler_types.h> #include <linux/kobject.h> +struct block_device; struct btrfs_fs_info; struct btrfs_device; struct btrfs_fs_devices; @@ -47,5 +48,11 @@ void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info); int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info); void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup); +int btrfs_read_policy_to_enum(const char *str, s64 *value); + +#ifdef CONFIG_BTRFS_EXPERIMENTAL +int __init btrfs_read_policy_init(void); +char *btrfs_get_mod_read_policy(void); +#endif #endif diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index e607b5d52fb1..b576897d71cc 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -30,6 +30,7 @@ const char *test_error[] = { [TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map", [TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map", [TEST_ALLOC_IO_CONTEXT] = "cannot allocate io context", + [TEST_ALLOC_TRANSACTION] = "cannot allocate transaction", }; static const struct super_operations btrfs_test_super_ops = { @@ -101,7 +102,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info) if (!dev) return ERR_PTR(-ENOMEM); - extent_io_tree_init(fs_info, &dev->alloc_state, 0); + btrfs_extent_io_tree_init(fs_info, &dev->alloc_state, 0); INIT_LIST_HEAD(&dev->dev_list); list_add(&dev->dev_list, &fs_info->fs_devices->devices); @@ -110,7 +111,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info) static void btrfs_free_dummy_device(struct btrfs_device *dev) { - extent_io_tree_release(&dev->alloc_state); + btrfs_extent_io_tree_release(&dev->alloc_state); kfree(dev); } @@ -142,6 +143,11 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); + + /* CRC32C csum size. */ + fs_info->csum_size = 4; + fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / + fs_info->csum_size; set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); test_mnt->mnt_sb->s_fs_info = fs_info; @@ -151,9 +157,9 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) { - struct radix_tree_iter iter; - void **slot; struct btrfs_device *dev, *tmp; + struct extent_buffer *eb; + unsigned long index; if (!fs_info) return; @@ -163,25 +169,13 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) test_mnt->mnt_sb->s_fs_info = NULL; - spin_lock(&fs_info->buffer_lock); - radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { - struct extent_buffer *eb; - - eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock); - if (!eb) - continue; - /* Shouldn't happen but that kind of thinking creates CVE's */ - if (radix_tree_exception(eb)) { - if (radix_tree_deref_retry(eb)) - slot = radix_tree_iter_retry(&iter); - continue; - } - slot = radix_tree_iter_resume(slot, &iter); - spin_unlock(&fs_info->buffer_lock); - free_extent_buffer_stale(eb); - spin_lock(&fs_info->buffer_lock); + xa_lock_irq(&fs_info->buffer_tree); + xa_for_each(&fs_info->buffer_tree, index, eb) { + xa_unlock_irq(&fs_info->buffer_tree); + free_extent_buffer(eb); + xa_lock_irq(&fs_info->buffer_tree); } - spin_unlock(&fs_info->buffer_lock); + xa_unlock_irq(&fs_info->buffer_tree); btrfs_mapping_tree_free(fs_info); list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices, @@ -247,6 +241,15 @@ void btrfs_free_dummy_block_group(struct btrfs_block_group *cache) kfree(cache); } +void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info) +{ + memset(trans, 0, sizeof(*trans)); + trans->fs_info = fs_info; + xa_init(&trans->delayed_refs.head_refs); + xa_init(&trans->delayed_refs.dirty_extents); + spin_lock_init(&trans->delayed_refs.lock); +} + void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { @@ -295,6 +298,9 @@ int btrfs_run_sanity_tests(void) ret = btrfs_test_raid_stripe_tree(sectorsize, nodesize); if (ret) goto out; + ret = btrfs_test_delayed_refs(sectorsize, nodesize); + if (ret) + goto out; } } ret = btrfs_test_extent_map(); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index b524ecf2f452..4307bdaa6749 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -6,6 +6,8 @@ #ifndef BTRFS_TESTS_H #define BTRFS_TESTS_H +#include <linux/types.h> + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int btrfs_run_sanity_tests(void); @@ -25,12 +27,14 @@ enum { TEST_ALLOC_EXTENT_MAP, TEST_ALLOC_CHUNK_MAP, TEST_ALLOC_IO_CONTEXT, + TEST_ALLOC_TRANSACTION, }; extern const char *test_error[]; struct btrfs_root; struct btrfs_trans_handle; +struct btrfs_transaction; int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize); int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize); @@ -40,6 +44,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize); int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize); int btrfs_test_extent_map(void); +int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize); void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info); @@ -49,6 +54,7 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long lengt void btrfs_free_dummy_block_group(struct btrfs_block_group *cache); void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); +void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info); #else static inline int btrfs_run_sanity_tests(void) diff --git a/fs/btrfs/tests/delayed-refs-tests.c b/fs/btrfs/tests/delayed-refs-tests.c new file mode 100644 index 000000000000..265370e79a54 --- /dev/null +++ b/fs/btrfs/tests/delayed-refs-tests.c @@ -0,0 +1,1016 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/sizes.h> +#include "btrfs-tests.h" +#include "../transaction.h" +#include "../delayed-ref.h" +#include "../extent-tree.h" + +#define FAKE_ROOT_OBJECTID 256 +#define FAKE_BYTENR 0 +#define FAKE_LEVEL 1 +#define FAKE_INO 256 +#define FAKE_FILE_OFFSET 0 +#define FAKE_PARENT SZ_1M + +struct ref_head_check { + u64 bytenr; + u64 num_bytes; + int ref_mod; + int total_ref_mod; + int must_insert; +}; + +struct ref_node_check { + u64 bytenr; + u64 num_bytes; + int ref_mod; + enum btrfs_delayed_ref_action action; + u8 type; + u64 parent; + u64 root; + u64 owner; + u64 offset; +}; + +static enum btrfs_ref_type ref_type_from_disk_ref_type(u8 type) +{ + if ((type == BTRFS_TREE_BLOCK_REF_KEY) || + (type == BTRFS_SHARED_BLOCK_REF_KEY)) + return BTRFS_REF_METADATA; + return BTRFS_REF_DATA; +} + +static void delete_delayed_ref_head(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + + spin_lock(&delayed_refs->lock); + spin_lock(&head->lock); + btrfs_delete_ref_head(fs_info, delayed_refs, head); + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + + btrfs_delayed_ref_unlock(head); + btrfs_put_delayed_ref_head(head); +} + +static void delete_delayed_ref_node(struct btrfs_delayed_ref_head *head, + struct btrfs_delayed_ref_node *node) +{ + rb_erase_cached(&node->ref_node, &head->ref_tree); + RB_CLEAR_NODE(&node->ref_node); + if (!list_empty(&node->add_list)) + list_del_init(&node->add_list); + btrfs_put_delayed_ref(node); +} + +static int validate_ref_head(struct btrfs_delayed_ref_head *head, + struct ref_head_check *check) +{ + if (head->bytenr != check->bytenr) { + test_err("invalid bytenr have: %llu want: %llu", head->bytenr, + check->bytenr); + return -EINVAL; + } + + if (head->num_bytes != check->num_bytes) { + test_err("invalid num_bytes have: %llu want: %llu", + head->num_bytes, check->num_bytes); + return -EINVAL; + } + + if (head->ref_mod != check->ref_mod) { + test_err("invalid ref_mod have: %d want: %d", head->ref_mod, + check->ref_mod); + return -EINVAL; + } + + if (head->total_ref_mod != check->total_ref_mod) { + test_err("invalid total_ref_mod have: %d want: %d", + head->total_ref_mod, check->total_ref_mod); + return -EINVAL; + } + + if (head->must_insert_reserved != check->must_insert) { + test_err("invalid must_insert have: %d want: %d", + head->must_insert_reserved, check->must_insert); + return -EINVAL; + } + + return 0; +} + +static int validate_ref_node(struct btrfs_delayed_ref_node *node, + struct ref_node_check *check) +{ + if (node->bytenr != check->bytenr) { + test_err("invalid bytenr have: %llu want: %llu", node->bytenr, + check->bytenr); + return -EINVAL; + } + + if (node->num_bytes != check->num_bytes) { + test_err("invalid num_bytes have: %llu want: %llu", + node->num_bytes, check->num_bytes); + return -EINVAL; + } + + if (node->ref_mod != check->ref_mod) { + test_err("invalid ref_mod have: %d want: %d", node->ref_mod, + check->ref_mod); + return -EINVAL; + } + + if (node->action != check->action) { + test_err("invalid action have: %d want: %d", node->action, + check->action); + return -EINVAL; + } + + if (node->parent != check->parent) { + test_err("invalid parent have: %llu want: %llu", node->parent, + check->parent); + return -EINVAL; + } + + if (node->ref_root != check->root) { + test_err("invalid root have: %llu want: %llu", node->ref_root, + check->root); + return -EINVAL; + } + + if (node->type != check->type) { + test_err("invalid type have: %d want: %d", node->type, + check->type); + return -EINVAL; + } + + if (btrfs_delayed_ref_owner(node) != check->owner) { + test_err("invalid owner have: %llu want: %llu", + btrfs_delayed_ref_owner(node), check->owner); + return -EINVAL; + } + + if (btrfs_delayed_ref_offset(node) != check->offset) { + test_err("invalid offset have: %llu want: %llu", + btrfs_delayed_ref_offset(node), check->offset); + return -EINVAL; + } + + return 0; +} + +static int simple_test(struct btrfs_trans_handle *trans, + struct ref_head_check *head_check, + struct ref_node_check *node_check) +{ + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_node *node; + struct btrfs_ref ref = { + .type = ref_type_from_disk_ref_type(node_check->type), + .action = node_check->action, + .parent = node_check->parent, + .ref_root = node_check->root, + .bytenr = node_check->bytenr, + .num_bytes = fs_info->nodesize, + }; + int ret; + + if (ref.type == BTRFS_REF_METADATA) + btrfs_init_tree_ref(&ref, node_check->owner, node_check->root, + false); + else + btrfs_init_data_ref(&ref, node_check->owner, node_check->offset, + node_check->root, true); + + if (ref.type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + return ret; + } + + head = btrfs_select_ref_head(fs_info, delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + return -EINVAL; + } + + ret = -EINVAL; + if (validate_ref_head(head, head_check)) + goto out; + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, node_check)) + goto out; + ret = 0; +out: + btrfs_unselect_ref_head(delayed_refs, head); + btrfs_destroy_delayed_refs(trans->transaction); + return ret; +} + +/* + * These are simple tests, make sure that our btrfs_ref's get turned into the + * appropriate btrfs_delayed_ref_node based on their settings and action. + */ +static int simple_tests(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct ref_head_check head_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 1, + .total_ref_mod = 1, + }; + struct ref_node_check node_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 1, + .action = BTRFS_ADD_DELAYED_REF, + .type = BTRFS_TREE_BLOCK_REF_KEY, + .parent = 0, + .root = FAKE_ROOT_OBJECTID, + .owner = FAKE_LEVEL, + .offset = 0, + }; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single add tree block failed"); + return -EINVAL; + } + + node_check.type = BTRFS_EXTENT_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single add extent data failed"); + return -EINVAL; + } + + node_check.parent = FAKE_PARENT; + node_check.type = BTRFS_SHARED_BLOCK_REF_KEY; + node_check.owner = FAKE_LEVEL; + node_check.offset = 0; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single add shared block failed"); + return -EINVAL; + } + + node_check.type = BTRFS_SHARED_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single add shared data failed"); + return -EINVAL; + } + + head_check.ref_mod = -1; + head_check.total_ref_mod = -1; + node_check.action = BTRFS_DROP_DELAYED_REF; + node_check.type = BTRFS_TREE_BLOCK_REF_KEY; + node_check.owner = FAKE_LEVEL; + node_check.offset = 0; + node_check.parent = 0; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single drop tree block failed"); + return -EINVAL; + } + + node_check.type = BTRFS_EXTENT_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single drop extent data failed"); + return -EINVAL; + } + + node_check.parent = FAKE_PARENT; + node_check.type = BTRFS_SHARED_BLOCK_REF_KEY; + node_check.owner = FAKE_LEVEL; + node_check.offset = 0; + if (simple_test(trans, &head_check, &node_check)) { + test_err("single drop shared block failed"); + return -EINVAL; + } + + node_check.type = BTRFS_SHARED_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + if (simple_test(trans, &head_check, &node_check)) { + test_err("single drop shared data failed"); + return -EINVAL; + } + + return 0; +} + +/* + * Merge tests, validate that we do delayed ref merging properly, the ref counts + * all end up properly, and delayed refs are deleted once they're no longer + * needed. + */ +static int merge_tests(struct btrfs_trans_handle *trans, + enum btrfs_ref_type type) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_head *head = NULL; + struct btrfs_delayed_ref_node *node; + struct btrfs_ref ref = { + .type = type, + .action = BTRFS_ADD_DELAYED_REF, + .parent = 0, + .ref_root = FAKE_ROOT_OBJECTID, + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + }; + struct ref_head_check head_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 0, + .total_ref_mod = 0, + }; + struct ref_node_check node_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 2, + .action = BTRFS_ADD_DELAYED_REF, + .parent = 0, + .root = FAKE_ROOT_OBJECTID, + }; + int ret; + + /* + * First add a ref and then drop it, make sure we get a head ref with a + * 0 total ref mod and no nodes. + */ + if (type == BTRFS_REF_METADATA) { + node_check.type = BTRFS_TREE_BLOCK_REF_KEY; + node_check.owner = FAKE_LEVEL; + btrfs_init_tree_ref(&ref, FAKE_LEVEL, FAKE_ROOT_OBJECTID, false); + } else { + node_check.type = BTRFS_EXTENT_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + btrfs_init_data_ref(&ref, FAKE_INO, FAKE_FILE_OFFSET, + FAKE_ROOT_OBJECTID, true); + } + + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + return ret; + } + + ref.action = BTRFS_DROP_DELAYED_REF; + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + goto out; + } + + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("single add and drop failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + + delete_delayed_ref_head(trans, head); + head = NULL; + + /* + * Add a ref, then add another ref, make sure we get a head ref with a + * 2 total ref mod and 1 node. + */ + ref.action = BTRFS_ADD_DELAYED_REF; + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + goto out; + } + + head_check.ref_mod = 2; + head_check.total_ref_mod = 2; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("double add failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + delete_delayed_ref_head(trans, head); + head = NULL; + + /* Add two drop refs, make sure they are merged properly. */ + ref.action = BTRFS_DROP_DELAYED_REF; + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + goto out; + } + + head_check.ref_mod = -2; + head_check.total_ref_mod = -2; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("double drop failed"); + goto out; + } + + node_check.action = BTRFS_DROP_DELAYED_REF; + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + delete_delayed_ref_head(trans, head); + head = NULL; + + /* Add multiple refs, then drop until we go negative again. */ + ref.action = BTRFS_ADD_DELAYED_REF; + for (int i = 0; i < 10; i++) { + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + ref.action = BTRFS_DROP_DELAYED_REF; + for (int i = 0; i < 12; i++) { + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + goto out; + } + + head_check.ref_mod = -2; + head_check.total_ref_mod = -2; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("double drop failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + + delete_delayed_ref_head(trans, head); + head = NULL; + + /* Drop multiple refs, then add until we go positive again. */ + ref.action = BTRFS_DROP_DELAYED_REF; + for (int i = 0; i < 10; i++) { + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + ref.action = BTRFS_ADD_DELAYED_REF; + for (int i = 0; i < 12; i++) { + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + goto out; + } + + head_check.ref_mod = 2; + head_check.total_ref_mod = 2; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("add and drop to positive failed"); + goto out; + } + + node_check.action = BTRFS_ADD_DELAYED_REF; + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + delete_delayed_ref_head(trans, head); + head = NULL; + + /* + * Add a bunch of refs with different roots and parents, then drop them + * all, make sure everything is properly merged. + */ + ref.action = BTRFS_ADD_DELAYED_REF; + for (int i = 0; i < 50; i++) { + if (!(i % 2)) { + ref.parent = 0; + ref.ref_root = FAKE_ROOT_OBJECTID + i; + } else { + ref.parent = FAKE_PARENT + (i * fs_info->nodesize); + } + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + ref.action = BTRFS_DROP_DELAYED_REF; + for (int i = 0; i < 50; i++) { + if (!(i % 2)) { + ref.parent = 0; + ref.ref_root = FAKE_ROOT_OBJECTID + i; + } else { + ref.parent = FAKE_PARENT + (i * fs_info->nodesize); + } + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + goto out; + } + + head_check.ref_mod = 0; + head_check.total_ref_mod = 0; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("add and drop multiple failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + ret = 0; +out: + if (!IS_ERR_OR_NULL(head)) + btrfs_unselect_ref_head(&trans->transaction->delayed_refs, head); + btrfs_destroy_delayed_refs(trans->transaction); + return ret; +} + +/* + * Basic test to validate we always get the add operations first followed by any + * delete operations. + */ +static int select_delayed_refs_test(struct btrfs_trans_handle *trans) +{ + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_head *head = NULL; + struct btrfs_delayed_ref_node *node; + struct btrfs_ref ref = { + .type = BTRFS_REF_METADATA, + .action = BTRFS_DROP_DELAYED_REF, + .parent = 0, + .ref_root = FAKE_ROOT_OBJECTID, + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + }; + struct ref_head_check head_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 0, + .total_ref_mod = 0, + }; + struct ref_node_check node_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 1, + .action = BTRFS_ADD_DELAYED_REF, + .type = BTRFS_TREE_BLOCK_REF_KEY, + .parent = 0, + .owner = FAKE_LEVEL, + .offset = 0, + }; + int ret; + + /* Add the drop first. */ + btrfs_init_tree_ref(&ref, FAKE_LEVEL, FAKE_ROOT_OBJECTID, false); + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + return ret; + } + + /* + * Now add the add, and make it a different root so it's logically later + * in the rb tree. + */ + ref.action = BTRFS_ADD_DELAYED_REF; + ref.ref_root = FAKE_ROOT_OBJECTID + 1; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + head = NULL; + goto out; + } + + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("head check failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + node_check.root = FAKE_ROOT_OBJECTID + 1; + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + node_check.action = BTRFS_DROP_DELAYED_REF; + node_check.root = FAKE_ROOT_OBJECTID; + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + delete_delayed_ref_node(head, node); + delete_delayed_ref_head(trans, head); + head = NULL; + + /* + * Now we're going to do the same thing, but we're going to have an add + * that gets deleted because of a merge, and make sure we still have + * another add in place. + */ + ref.action = BTRFS_DROP_DELAYED_REF; + ref.ref_root = FAKE_ROOT_OBJECTID; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + ref.action = BTRFS_ADD_DELAYED_REF; + ref.ref_root = FAKE_ROOT_OBJECTID + 1; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + ref.action = BTRFS_DROP_DELAYED_REF; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + ref.action = BTRFS_ADD_DELAYED_REF; + ref.ref_root = FAKE_ROOT_OBJECTID + 2; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + head = NULL; + goto out; + } + + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("head check failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + node_check.action = BTRFS_ADD_DELAYED_REF; + node_check.root = FAKE_ROOT_OBJECTID + 2; + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + node_check.action = BTRFS_DROP_DELAYED_REF; + node_check.root = FAKE_ROOT_OBJECTID; + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + delete_delayed_ref_node(head, node); + ret = 0; +out: + if (head) + btrfs_unselect_ref_head(delayed_refs, head); + btrfs_destroy_delayed_refs(trans->transaction); + return ret; +} + +int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize) +{ + struct btrfs_transaction *transaction; + struct btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info; + int ret; + + test_msg("running delayed refs tests"); + + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } + transaction = kmalloc(sizeof(*transaction), GFP_KERNEL); + if (!transaction) { + test_std_err(TEST_ALLOC_TRANSACTION); + ret = -ENOMEM; + goto out_free_fs_info; + } + btrfs_init_dummy_trans(&trans, fs_info); + btrfs_init_dummy_transaction(transaction, fs_info); + trans.transaction = transaction; + + ret = simple_tests(&trans); + if (!ret) { + test_msg("running delayed refs merg tests on metadata refs"); + ret = merge_tests(&trans, BTRFS_REF_METADATA); + } + + if (!ret) { + test_msg("running delayed refs merg tests on data refs"); + ret = merge_tests(&trans, BTRFS_REF_DATA); + } + + if (!ret) + ret = select_delayed_refs_test(&trans); + + kfree(transaction); +out_free_fs_info: + btrfs_free_dummy_fs_info(fs_info); + return ret; +} diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 0a2dbfaaf49e..b19328d077d3 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -14,17 +14,17 @@ #include "../disk-io.h" #include "../btrfs_inode.h" -#define PROCESS_UNLOCK (1 << 0) -#define PROCESS_RELEASE (1 << 1) -#define PROCESS_TEST_LOCKED (1 << 2) +#define PROCESS_UNLOCK (1U << 0) +#define PROCESS_RELEASE (1U << 1) +#define PROCESS_TEST_LOCKED (1U << 2) static noinline int process_page_range(struct inode *inode, u64 start, u64 end, unsigned long flags) { int ret; struct folio_batch fbatch; - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; + pgoff_t index = start >> PAGE_SHIFT; + pgoff_t end_index = end >> PAGE_SHIFT; int i; int count = 0; int loops = 0; @@ -74,9 +74,9 @@ static void extent_flag_to_str(const struct extent_state *state, char *dest) dest[0] = 0; PRINT_ONE_FLAG(state, dest, cur, DIRTY); - PRINT_ONE_FLAG(state, dest, cur, UPTODATE); PRINT_ONE_FLAG(state, dest, cur, LOCKED); - PRINT_ONE_FLAG(state, dest, cur, NEW); + PRINT_ONE_FLAG(state, dest, cur, DIRTY_LOG1); + PRINT_ONE_FLAG(state, dest, cur, DIRTY_LOG2); PRINT_ONE_FLAG(state, dest, cur, DELALLOC); PRINT_ONE_FLAG(state, dest, cur, DEFRAG); PRINT_ONE_FLAG(state, dest, cur, BOUNDARY); @@ -114,7 +114,6 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) struct extent_io_tree *tmp; struct page *page; struct page *locked_page = NULL; - unsigned long index = 0; /* In this test we need at least 2 file extents at its maximum size */ u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; u64 total_dirty = 2 * max_bytes; @@ -150,14 +149,14 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) * Passing NULL as we don't have fs_info but tracepoints are not used * at this point */ - extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST); + btrfs_extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST); /* * First go through and create and mark all of our pages dirty, we pin * everything to make sure our pages don't get evicted and screw up our * test. */ - for (index = 0; index < (total_dirty >> PAGE_SHIFT); index++) { + for (pgoff_t index = 0; index < (total_dirty >> PAGE_SHIFT); index++) { page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL); if (!page) { test_err("failed to allocate test page"); @@ -177,7 +176,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) * |--- delalloc ---| * |--- search ---| */ - set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL); + btrfs_set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL); start = 0; end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, @@ -191,7 +190,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) sectorsize - 1, start, end); goto out_bits; } - unlock_extent(tmp, start, end, NULL); + btrfs_unlock_extent(tmp, start, end, NULL); unlock_page(locked_page); put_page(locked_page); @@ -208,7 +207,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) test_err("couldn't find the locked page"); goto out_bits; } - set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL); + btrfs_set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL); start = test_start; end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, @@ -227,7 +226,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) test_err("there were unlocked pages in the range"); goto out_bits; } - unlock_extent(tmp, start, end, NULL); + btrfs_unlock_extent(tmp, start, end, NULL); /* locked_page was unlocked above */ put_page(locked_page); @@ -263,7 +262,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) * * We are re-using our test_start from above since it works out well. */ - set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL); + btrfs_set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL); start = test_start; end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, @@ -282,7 +281,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) test_err("pages in range were not all locked"); goto out_bits; } - unlock_extent(tmp, start, end, NULL); + btrfs_unlock_extent(tmp, start, end, NULL); /* * Now to test where we run into a page that is no longer dirty in the @@ -327,7 +326,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) out_bits: if (ret) dump_extent_io_tree(tmp); - clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1); + btrfs_clear_extent_bit(tmp, 0, total_dirty - 1, (unsigned)-1, NULL); out: if (locked_page) put_page(locked_page); @@ -344,11 +343,11 @@ static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb) unsigned long i; for (i = 0; i < eb->len * BITS_PER_BYTE; i++) { - int bit, bit1; + bool bit_set, bit1_set; - bit = !!test_bit(i, bitmap); - bit1 = !!extent_buffer_test_bit(eb, 0, i); - if (bit1 != bit) { + bit_set = test_bit(i, bitmap); + bit1_set = extent_buffer_test_bit(eb, 0, i); + if (bit1_set != bit_set) { u8 has; u8 expect; @@ -361,9 +360,9 @@ static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb) return -EINVAL; } - bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE, - i % BITS_PER_BYTE); - if (bit1 != bit) { + bit1_set = extent_buffer_test_bit(eb, i / BITS_PER_BYTE, + i % BITS_PER_BYTE); + if (bit1_set != bit_set) { u8 has; u8 expect; @@ -525,7 +524,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) goto out; } - eb = __alloc_dummy_extent_buffer(fs_info, 0, nodesize); + eb = alloc_dummy_extent_buffer(fs_info, 0); if (!eb) { test_std_err(TEST_ALLOC_ROOT); ret = -ENOMEM; @@ -542,7 +541,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) * Test again for case where the tree block is sectorsize aligned but * not nodesize aligned. */ - eb = __alloc_dummy_extent_buffer(fs_info, sectorsize, nodesize); + eb = alloc_dummy_extent_buffer(fs_info, sectorsize); if (!eb) { test_std_err(TEST_ALLOC_ROOT); ret = -ENOMEM; @@ -565,10 +564,10 @@ static int test_find_first_clear_extent_bit(void) test_msg("running find_first_clear_extent_bit test"); - extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST); + btrfs_extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST); /* Test correct handling of empty tree */ - find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED); + btrfs_find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED); if (start != 0 || end != -1) { test_err( "error getting a range from completely empty tree: start %llu end %llu", @@ -579,11 +578,11 @@ static int test_find_first_clear_extent_bit(void) * Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between * 4M-32M */ - set_extent_bit(&tree, SZ_1M, SZ_4M - 1, - CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); + btrfs_set_extent_bit(&tree, SZ_1M, SZ_4M - 1, + CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); - find_first_clear_extent_bit(&tree, SZ_512K, &start, &end, - CHUNK_TRIMMED | CHUNK_ALLOCATED); + btrfs_find_first_clear_extent_bit(&tree, SZ_512K, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); if (start != 0 || end != SZ_1M - 1) { test_err("error finding beginning range: start %llu end %llu", @@ -592,14 +591,14 @@ static int test_find_first_clear_extent_bit(void) } /* Now add 32M-64M so that we have a hole between 4M-32M */ - set_extent_bit(&tree, SZ_32M, SZ_64M - 1, - CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); + btrfs_set_extent_bit(&tree, SZ_32M, SZ_64M - 1, + CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); /* * Request first hole starting at 12M, we should get 4M-32M */ - find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end, - CHUNK_TRIMMED | CHUNK_ALLOCATED); + btrfs_find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); if (start != SZ_4M || end != SZ_32M - 1) { test_err("error finding trimmed range: start %llu end %llu", @@ -611,8 +610,8 @@ static int test_find_first_clear_extent_bit(void) * Search in the middle of allocated range, should get the next one * available, which happens to be unallocated -> 4M-32M */ - find_first_clear_extent_bit(&tree, SZ_2M, &start, &end, - CHUNK_TRIMMED | CHUNK_ALLOCATED); + btrfs_find_first_clear_extent_bit(&tree, SZ_2M, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); if (start != SZ_4M || end != SZ_32M - 1) { test_err("error finding next unalloc range: start %llu end %llu", @@ -624,9 +623,9 @@ static int test_find_first_clear_extent_bit(void) * Set 64M-72M with CHUNK_ALLOC flag, then search for CHUNK_TRIMMED flag * being unset in this range, we should get the entry in range 64M-72M */ - set_extent_bit(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED, NULL); - find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end, - CHUNK_TRIMMED); + btrfs_set_extent_bit(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED, NULL); + btrfs_find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end, + CHUNK_TRIMMED); if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) { test_err("error finding exact range: start %llu end %llu", @@ -634,8 +633,8 @@ static int test_find_first_clear_extent_bit(void) goto out; } - find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end, - CHUNK_TRIMMED); + btrfs_find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end, + CHUNK_TRIMMED); /* * Search in the middle of set range whose immediate neighbour doesn't @@ -651,7 +650,7 @@ static int test_find_first_clear_extent_bit(void) * Search beyond any known range, shall return after last known range * and end should be -1 */ - find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED); + btrfs_find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED); if (start != SZ_64M + SZ_8M || end != -1) { test_err( "error handling beyond end of range search: start %llu end %llu", @@ -663,7 +662,7 @@ static int test_find_first_clear_extent_bit(void) out: if (ret) dump_extent_io_tree(&tree); - clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED); + btrfs_clear_extent_bit(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); return ret; } @@ -730,7 +729,7 @@ static int test_eb_mem_ops(u32 sectorsize, u32 nodesize) goto out; } - eb = __alloc_dummy_extent_buffer(fs_info, SZ_1M, nodesize); + eb = alloc_dummy_extent_buffer(fs_info, SZ_1M); if (!eb) { test_std_err(TEST_ALLOC_EXTENT_BUFFER); ret = -ENOMEM; diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 56e61ac1cc64..3a86534c116f 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -22,7 +22,7 @@ static int free_extent_map_tree(struct btrfs_inode *inode) while (!RB_EMPTY_ROOT(&em_tree->root)) { node = rb_first(&em_tree->root); em = rb_entry(node, struct extent_map, rb_node); - remove_extent_mapping(inode, em); + btrfs_remove_extent_mapping(inode, em); #ifdef CONFIG_BTRFS_DEBUG if (refcount_read(&em->refs) != 1) { @@ -36,7 +36,7 @@ static int free_extent_map_tree(struct btrfs_inode *inode) refcount_set(&em->refs, 1); } #endif - free_extent_map(em); + btrfs_free_extent_map(em); } write_unlock(&em_tree->lock); @@ -68,7 +68,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) int ret; int ret2; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -87,10 +87,10 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_err("cannot add extent range [0, 16K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); /* Add [16K, 20K) following [0, 16K) */ - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -109,9 +109,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_err("cannot add extent range [16K, 20K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -137,7 +137,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) ret = -ENOENT; goto out; } - if (em->start != 0 || extent_map_end(em) != SZ_16K || + if (em->start != 0 || btrfs_extent_map_end(em) != SZ_16K || em->disk_bytenr != 0 || em->disk_num_bytes != SZ_16K) { test_err( "case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu", @@ -145,7 +145,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) em->disk_bytenr, em->disk_num_bytes); ret = -EINVAL; } - free_extent_map(em); + btrfs_free_extent_map(em); out: ret2 = free_extent_map_tree(inode); if (ret == 0) @@ -167,7 +167,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) int ret; int ret2; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -186,10 +186,10 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_err("cannot add extent range [0, 1K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); /* Add [4K, 8K) following [0, 1K) */ - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -208,9 +208,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_err("cannot add extent range [4K, 8K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -235,14 +235,14 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) ret = -ENOENT; goto out; } - if (em->start != 0 || extent_map_end(em) != SZ_1K || + if (em->start != 0 || btrfs_extent_map_end(em) != SZ_1K || em->disk_bytenr != EXTENT_MAP_INLINE) { test_err( "case2 [0 1K]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu", ret, em->start, em->len, em->disk_bytenr); ret = -EINVAL; } - free_extent_map(em); + btrfs_free_extent_map(em); out: ret2 = free_extent_map_tree(inode); if (ret == 0) @@ -260,7 +260,7 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, int ret; int ret2; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -279,9 +279,9 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, test_err("cannot add extent range [4K, 8K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -312,15 +312,15 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, * Since bytes within em are contiguous, em->block_start is identical to * em->start. */ - if (start < em->start || start + len > extent_map_end(em) || - em->start != extent_map_block_start(em)) { + if (start < em->start || start + len > btrfs_extent_map_end(em) || + em->start != btrfs_extent_map_block_start(em)) { test_err( "case3 [%llu %llu): ret %d em (start %llu len %llu disk_bytenr %llu block_len %llu)", start, start + len, ret, em->start, em->len, em->disk_bytenr, em->disk_num_bytes); ret = -EINVAL; } - free_extent_map(em); + btrfs_free_extent_map(em); out: ret2 = free_extent_map_tree(inode); if (ret == 0) @@ -369,7 +369,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, int ret; int ret2; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -388,9 +388,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, test_err("cannot add extent range [0, 8K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -410,9 +410,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, test_err("cannot add extent range [8K, 32K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -438,14 +438,14 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, ret = -ENOENT; goto out; } - if (start < em->start || start + len > extent_map_end(em)) { + if (start < em->start || start + len > btrfs_extent_map_end(em)) { test_err( "case4 [%llu %llu): ret %d, added wrong em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu)", start, start + len, ret, em->start, em->len, em->disk_bytenr, em->disk_num_bytes); ret = -EINVAL; } - free_extent_map(em); + btrfs_free_extent_map(em); out: ret2 = free_extent_map_tree(inode); if (ret == 0) @@ -498,7 +498,7 @@ static int add_compressed_extent(struct btrfs_inode *inode, struct extent_map *em; int ret; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -513,7 +513,7 @@ static int add_compressed_extent(struct btrfs_inode *inode, write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); - free_extent_map(em); + btrfs_free_extent_map(em); if (ret < 0) { test_err("cannot add extent map [%llu, %llu)", start, start + len); return ret; @@ -719,7 +719,7 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) if (ret) goto out; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -751,7 +751,7 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) } ret = 0; out: - free_extent_map(em); + btrfs_free_extent_map(em); ret2 = free_extent_map_tree(inode); if (ret == 0) ret = ret2; @@ -773,7 +773,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_msg("Running btrfs_drop_extent_cache with pinned"); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -793,9 +793,9 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_err("couldn't add extent map"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -815,7 +815,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_err("couldn't add extent map"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); /* * Drop [0, 36K) This should skip the [0, 4K) extent and then split the @@ -826,7 +826,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) /* Make sure our extent maps look sane. */ ret = -EINVAL; - em = lookup_extent_mapping(em_tree, 0, SZ_16K); + em = btrfs_lookup_extent_mapping(em_tree, 0, SZ_16K); if (!em) { test_err("didn't find an em at 0 as expected"); goto out; @@ -842,10 +842,10 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, SZ_16K, SZ_16K); + em = btrfs_lookup_extent_mapping(em_tree, SZ_16K, SZ_16K); read_unlock(&em_tree->lock); if (em) { test_err("found an em when we weren't expecting one"); @@ -853,7 +853,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) } read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, SZ_32K, SZ_16K); + em = btrfs_lookup_extent_mapping(em_tree, SZ_32K, SZ_16K); read_unlock(&em_tree->lock); if (!em) { test_err("didn't find an em at 32K as expected"); @@ -870,16 +870,16 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) goto out; } - if (extent_map_block_start(em) != SZ_32K + SZ_4K) { + if (btrfs_extent_map_block_start(em) != SZ_32K + SZ_4K) { test_err("em->block_start is %llu, expected 36K", - extent_map_block_start(em)); + btrfs_extent_map_block_start(em)); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, 48 * SZ_1K, (u64)-1); + em = btrfs_lookup_extent_mapping(em_tree, 48 * SZ_1K, (u64)-1); read_unlock(&em_tree->lock); if (em) { test_err("found an unexpected em above 48K"); @@ -888,9 +888,9 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) ret = 0; out: - free_extent_map(em); + btrfs_free_extent_map(em); /* Unpin our extent to prevent warning when removing it below. */ - ret2 = unpin_extent_cache(inode, 0, SZ_16K, 0); + ret2 = btrfs_unpin_extent_cache(inode, 0, SZ_16K, 0); if (ret == 0) ret = ret2; ret2 = free_extent_map_tree(inode); @@ -913,7 +913,7 @@ static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) int ret; int ret2; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -928,13 +928,13 @@ static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); - free_extent_map(em); + btrfs_free_extent_map(em); if (ret < 0) { test_err("couldn't add extent map for range [120K, 128K)"); goto out; } - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -967,7 +967,7 @@ static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(inode, &em, SZ_1K * 140, SZ_4K); write_unlock(&em_tree->lock); - free_extent_map(em); + btrfs_free_extent_map(em); if (ret < 0) { test_err("couldn't add extent map for range [108K, 144K)"); goto out; @@ -1045,6 +1045,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, ret = btrfs_add_chunk_map(fs_info, map); if (ret) { test_err("error adding chunk map to mapping tree"); + btrfs_free_chunk_map(map); goto out_free; } diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index b61972046feb..c8822edd32e2 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -32,7 +32,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans, unsigned int i; int ret; - info = search_free_space_info(trans, cache, path, 0); + info = btrfs_search_free_space_info(trans, cache, path, 0); if (IS_ERR(info)) { test_err("could not find free space info"); ret = PTR_ERR(info); @@ -57,7 +57,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans, goto invalid; offset = key.objectid; while (offset < key.objectid + key.offset) { - bit = free_space_test_bit(cache, path, offset); + bit = btrfs_free_space_test_bit(cache, path, offset); if (prev_bit == 0 && bit == 1) { extent_start = offset; } else if (prev_bit == 1 && bit == 0) { @@ -115,7 +115,7 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans, u32 flags; int ret; - info = search_free_space_info(trans, cache, path, 0); + info = btrfs_search_free_space_info(trans, cache, path, 0); if (IS_ERR(info)) { test_err("could not find free space info"); btrfs_release_path(path); @@ -131,13 +131,13 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans, /* Flip it to the other format and check that for good measure. */ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { - ret = convert_free_space_to_extents(trans, cache, path); + ret = btrfs_convert_free_space_to_extents(trans, cache, path); if (ret) { test_err("could not convert to extents"); return ret; } } else { - ret = convert_free_space_to_bitmaps(trans, cache, path); + ret = btrfs_convert_free_space_to_bitmaps(trans, cache, path); if (ret) { test_err("could not convert to bitmaps"); return ret; @@ -170,9 +170,8 @@ static int test_remove_all(struct btrfs_trans_handle *trans, const struct free_space_extent extents[] = {}; int ret; - ret = __remove_from_free_space_tree(trans, cache, path, - cache->start, - cache->length); + ret = __btrfs_remove_from_free_space_tree(trans, cache, path, + cache->start, cache->length); if (ret) { test_err("could not remove free space"); return ret; @@ -193,8 +192,8 @@ static int test_remove_beginning(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, cache, path, - cache->start, alignment); + ret = __btrfs_remove_from_free_space_tree(trans, cache, path, + cache->start, alignment); if (ret) { test_err("could not remove free space"); return ret; @@ -216,7 +215,7 @@ static int test_remove_end(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, cache, path, + ret = __btrfs_remove_from_free_space_tree(trans, cache, path, cache->start + cache->length - alignment, alignment); if (ret) { @@ -240,9 +239,9 @@ static int test_remove_middle(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, cache, path, - cache->start + alignment, - alignment); + ret = __btrfs_remove_from_free_space_tree(trans, cache, path, + cache->start + alignment, + alignment); if (ret) { test_err("could not remove free space"); return ret; @@ -263,23 +262,22 @@ static int test_merge_left(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, cache, path, - cache->start, cache->length); + ret = __btrfs_remove_from_free_space_tree(trans, cache, path, + cache->start, cache->length); if (ret) { test_err("could not remove free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, cache->start, - alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, cache->start, + alignment); if (ret) { test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, - cache->start + alignment, - alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, + cache->start + alignment, alignment); if (ret) { test_err("could not add free space"); return ret; @@ -300,24 +298,23 @@ static int test_merge_right(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, cache, path, - cache->start, cache->length); + ret = __btrfs_remove_from_free_space_tree(trans, cache, path, + cache->start, cache->length); if (ret) { test_err("could not remove free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, - cache->start + 2 * alignment, - alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, + cache->start + 2 * alignment, + alignment); if (ret) { test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, - cache->start + alignment, - alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, + cache->start + alignment, alignment); if (ret) { test_err("could not add free space"); return ret; @@ -338,29 +335,29 @@ static int test_merge_both(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, cache, path, - cache->start, cache->length); + ret = __btrfs_remove_from_free_space_tree(trans, cache, path, + cache->start, cache->length); if (ret) { test_err("could not remove free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, cache->start, - alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, cache->start, + alignment); if (ret) { test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, - cache->start + 2 * alignment, alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, + cache->start + 2 * alignment, alignment); if (ret) { test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, - cache->start + alignment, alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, + cache->start + alignment, alignment); if (ret) { test_err("could not add free space"); return ret; @@ -383,29 +380,29 @@ static int test_merge_none(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, cache, path, - cache->start, cache->length); + ret = __btrfs_remove_from_free_space_tree(trans, cache, path, + cache->start, cache->length); if (ret) { test_err("could not remove free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, cache->start, - alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, cache->start, + alignment); if (ret) { test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, - cache->start + 4 * alignment, alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, + cache->start + 4 * alignment, alignment); if (ret) { test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, - cache->start + 2 * alignment, alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, + cache->start + 2 * alignment, alignment); if (ret) { test_err("could not add free space"); return ret; @@ -483,14 +480,14 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, goto out; } - ret = add_block_group_free_space(&trans, cache); + ret = btrfs_add_block_group_free_space(&trans, cache); if (ret) { test_err("could not add block group free space"); goto out; } if (bitmaps) { - ret = convert_free_space_to_bitmaps(&trans, cache, path); + ret = btrfs_convert_free_space_to_bitmaps(&trans, cache, path); if (ret) { test_err("could not convert block group to bitmaps"); goto out; @@ -501,7 +498,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, if (ret) goto out; - ret = remove_block_group_free_space(&trans, cache); + ret = btrfs_remove_block_group_free_space(&trans, cache); if (ret) { test_err("could not remove block group free space"); goto out; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 3ea3bc2225fe..a4c2b7748b95 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -268,7 +268,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("expected a hole, got %llu", em->disk_bytenr); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); /* @@ -314,7 +314,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) * this? */ offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -336,7 +336,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* Regular extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -363,7 +363,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* The next 3 are split extents */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -389,10 +389,10 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - disk_bytenr = extent_map_block_start(em); + disk_bytenr = btrfs_extent_map_block_start(em); orig_start = em->start; offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -414,7 +414,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -441,13 +441,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } disk_bytenr += (em->start - orig_start); - if (extent_map_block_start(em) != disk_bytenr) { + if (btrfs_extent_map_block_start(em) != disk_bytenr) { test_err("wrong block start, want %llu, have %llu", - disk_bytenr, extent_map_block_start(em)); + disk_bytenr, btrfs_extent_map_block_start(em)); goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* Prealloc extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -475,7 +475,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* The next 3 are a half written prealloc extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -502,10 +502,10 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - disk_bytenr = extent_map_block_start(em); + disk_bytenr = btrfs_extent_map_block_start(em); orig_start = em->start; offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -531,13 +531,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em->start - orig_start, em->offset); goto out; } - if (extent_map_block_start(em) != disk_bytenr + em->offset) { + if (btrfs_extent_map_block_start(em) != disk_bytenr + em->offset) { test_err("unexpected block start, wanted %llu, have %llu", - disk_bytenr + em->offset, extent_map_block_start(em)); + disk_bytenr + em->offset, btrfs_extent_map_block_start(em)); goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -564,13 +564,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em->start, em->offset, orig_start); goto out; } - if (extent_map_block_start(em) != disk_bytenr + em->offset) { + if (btrfs_extent_map_block_start(em) != disk_bytenr + em->offset) { test_err("unexpected block start, wanted %llu, have %llu", - disk_bytenr + em->offset, extent_map_block_start(em)); + disk_bytenr + em->offset, btrfs_extent_map_block_start(em)); goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* Now for the compressed extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -597,13 +597,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { + if (btrfs_extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { test_err("unexpected compress type, wanted %d, got %d", - BTRFS_COMPRESS_ZLIB, extent_map_compression(em)); + BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em)); goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* Split compressed extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -630,15 +630,15 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { + if (btrfs_extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { test_err("unexpected compress type, wanted %d, got %d", - BTRFS_COMPRESS_ZLIB, extent_map_compression(em)); + BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em)); goto out; } - disk_bytenr = extent_map_block_start(em); + disk_bytenr = btrfs_extent_map_block_start(em); orig_start = em->start; offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -664,16 +664,16 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (extent_map_block_start(em) != disk_bytenr) { + if (btrfs_extent_map_block_start(em) != disk_bytenr) { test_err("block start does not match, want %llu got %llu", - disk_bytenr, extent_map_block_start(em)); + disk_bytenr, btrfs_extent_map_block_start(em)); goto out; } if (em->start != offset || em->len != 2 * sectorsize) { @@ -692,13 +692,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em->start, em->offset, orig_start); goto out; } - if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { + if (btrfs_extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { test_err("unexpected compress type, wanted %d, got %d", - BTRFS_COMPRESS_ZLIB, extent_map_compression(em)); + BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em)); goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* A hole between regular extents but no hole extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset + 6, sectorsize); @@ -725,7 +725,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, SZ_4M); if (IS_ERR(em)) { @@ -757,7 +757,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -785,7 +785,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) ret = 0; out: if (!IS_ERR(em)) - free_extent_map(em); + btrfs_free_extent_map(em); iput(inode); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); @@ -858,15 +858,16 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) em->flags); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, sectorsize, 2 * sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (extent_map_block_start(em) != sectorsize) { - test_err("expected a real extent, got %llu", extent_map_block_start(em)); + if (btrfs_extent_map_block_start(em) != sectorsize) { + test_err("expected a real extent, got %llu", + btrfs_extent_map_block_start(em)); goto out; } if (em->start != sectorsize || em->len != sectorsize) { @@ -883,7 +884,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) ret = 0; out: if (!IS_ERR(em)) - free_extent_map(em); + btrfs_free_extent_map(em); iput(inode); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); @@ -949,11 +950,10 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* [BTRFS_MAX_EXTENT_SIZE/2][sectorsize HOLE][the rest] */ - ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, - BTRFS_MAX_EXTENT_SIZE >> 1, - (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, NULL); + ret = btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE >> 1, + (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1017,11 +1017,10 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */ - ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, - BTRFS_MAX_EXTENT_SIZE + sectorsize, - BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, NULL); + ret = btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE + sectorsize, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1052,9 +1051,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* Empty */ - ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, NULL); + ret = btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1068,9 +1066,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = 0; out: if (ret) - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, NULL); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL); iput(inode); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c index 30f17eb7b6a8..a7bc58a5c1e2 100644 --- a/fs/btrfs/tests/raid-stripe-tree-tests.c +++ b/fs/btrfs/tests/raid-stripe-tree-tests.c @@ -14,6 +14,8 @@ #define RST_TEST_NUM_DEVICES (2) #define RST_TEST_RAID1_TYPE (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1) +#define SZ_48K (SZ_32K + SZ_16K) + typedef int (*test_func_t)(struct btrfs_trans_handle *trans); static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_devices, @@ -30,6 +32,613 @@ static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_de } /* + * Test creating a range of three extents and then punch a hole in the middle, + * deleting all of the middle extents and partially deleting the "book ends". + */ +static int test_punch_hole_3extents(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical1 = SZ_1M; + u64 len1 = SZ_1M; + u64 logical2 = logical1 + len1; + u64 len2 = SZ_1M; + u64 logical3 = logical2 + len2; + u64 len3 = SZ_1M; + u64 hole_start = logical1 + SZ_256K; + u64 hole_len = SZ_2M; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + + /* Prepare for the test, 1st create 3 x 1M extents. */ + bioc->map_type = map_type; + bioc->size = len1; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical1 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical2; + bioc->size = len2; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical2 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical3; + bioc->size = len3; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical3 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + /* + * Delete a range starting at logical1 + 256K and 2M in length. Extent + * 1 is truncated to 256k length, extent 2 is completely dropped and + * extent 3 is moved 256K to the right. + */ + ret = btrfs_delete_raid_extent(trans, hole_start, hole_len); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + hole_start, hole_start + hole_len); + goto out; + } + + /* Get the first extent and check its size. */ + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical1, logical1 + len1); + goto out; + } + + if (io_stripe.physical != logical1) { + test_err("invalid physical address, expected %llu, got %llu", + logical1, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len1 != SZ_256K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_256K, len1); + ret = -EINVAL; + goto out; + } + + /* Get the second extent and check it's absent. */ + ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type, + 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded should fail", + logical2, logical2 + len2); + ret = -EINVAL; + goto out; + } + + /* Get the third extent and check its size. */ + logical3 += SZ_256K; + ret = btrfs_get_raid_extent_offset(fs_info, logical3, &len3, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical3, logical3 + len3); + goto out; + } + + if (io_stripe.physical != logical3) { + test_err("invalid physical address, expected %llu, got %llu", + logical3 + SZ_256K, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len3 != SZ_1M - SZ_256K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_1M - SZ_256K, len3); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical1, len1); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical1, logical1 + len1); + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical3, len3); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical1, logical1 + len1); + goto out; + } + +out: + btrfs_put_bioc(bioc); + return ret; +} + +static int test_delete_two_extents(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical1 = SZ_1M; + u64 len1 = SZ_1M; + u64 logical2 = logical1 + len1; + u64 len2 = SZ_1M; + u64 logical3 = logical2 + len2; + u64 len3 = SZ_1M; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + + /* Prepare for the test, 1st create 3 x 1M extents. */ + bioc->map_type = map_type; + bioc->size = len1; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical1 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical2; + bioc->size = len2; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical2 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical3; + bioc->size = len3; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical3 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + /* + * Delete a range starting at logical1 and 2M in length. Extents 1 + * and 2 are dropped and extent 3 is kept as is. + */ + ret = btrfs_delete_raid_extent(trans, logical1, len1 + len2); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical1, logical1 + len1 + len2); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type, + 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + logical1, len1); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type, + 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + logical2, len2); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical3, &len3, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical3, len3); + goto out; + } + + if (io_stripe.physical != logical3) { + test_err("invalid physical address, expected %llu, got %llu", + logical3, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len3 != SZ_1M) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_1M, len3); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical3, len3); +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* Test punching a hole into a single RAID stripe-extent. */ +static int test_punch_hole(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical1 = SZ_1M; + u64 hole_start = logical1 + SZ_32K; + u64 hole_len = SZ_64K; + u64 logical2 = hole_start + hole_len; + u64 len = SZ_1M; + u64 len1 = SZ_32K; + u64 len2 = len - len1 - hole_len; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical1 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len, map_type, 0, + &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical1, + logical1 + len); + goto out; + } + + if (io_stripe.physical != logical1) { + test_err("invalid physical address, expected %llu got %llu", + logical1, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_1M) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_1M, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, hole_start, hole_len); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + hole_start, hole_start + hole_len); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical1, logical1 + len1); + goto out; + } + + if (io_stripe.physical != logical1) { + test_err("invalid physical address, expected %llu, got %llu", + logical1, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len1 != SZ_32K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_32K, len1); + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical2, + logical2 + len2); + goto out; + } + + if (io_stripe.physical != logical2) { + test_err("invalid physical address, expected %llu, got %llu", + logical2, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len2 != len - len1 - hole_len) { + test_err("invalid length, expected %llu, got %llu", + len - len1 - hole_len, len2); + ret = -EINVAL; + goto out; + } + + /* Check for the absence of the hole. */ + ret = btrfs_get_raid_extent_offset(fs_info, hole_start, &hole_len, + map_type, 0, &io_stripe); + if (ret != -ENODATA) { + ret = -EINVAL; + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + hole_start, hole_start + SZ_64K); + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical1, len1); + if (ret) + goto out; + + ret = btrfs_delete_raid_extent(trans, logical2, len2); +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* + * Test a 1M RST write that spans two adjacent RST items on disk and then + * delete a portion starting in the first item and spanning into the second + * item. This is similar to test_front_delete(), but spanning multiple items. + */ +static int test_front_delete_prev_item(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical1 = SZ_1M; + u64 logical2 = SZ_2M; + u64 len = SZ_1M; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + /* Insert RAID extent 1. */ + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical1 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical2; + /* Insert RAID extent 2, directly adjacent to it. */ + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical2 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical1 + SZ_512K, SZ_1M); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical1 + SZ_512K, (u64)SZ_1M); + goto out; + } + + /* Verify item 1 is truncated to 512K. */ + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len, map_type, 0, + &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical1, + logical1 + len); + goto out; + } + + if (io_stripe.physical != logical1) { + test_err("invalid physical address, expected %llu got %llu", + logical1, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_512K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_512K, len); + ret = -EINVAL; + goto out; + } + + /* Verify item 2's start is moved by 512K. */ + ret = btrfs_get_raid_extent_offset(fs_info, logical2 + SZ_512K, &len, + map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical2 + SZ_512K, logical2 + len); + goto out; + } + + if (io_stripe.physical != logical2 + SZ_512K) { + test_err("invalid physical address, expected %llu got %llu", + logical2 + SZ_512K, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_512K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_512K, len); + ret = -EINVAL; + goto out; + } + + /* Verify there's a hole at [1M+512K, 2M+512K] . */ + len = SZ_1M; + ret = btrfs_get_raid_extent_offset(fs_info, logical1 + SZ_512K, &len, + map_type, 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID [%llu, %llu] succeeded, should fail", + logical1 + SZ_512K, logical1 + SZ_512K + len); + goto out; + } + + /* Clean up after us. */ + ret = btrfs_delete_raid_extent(trans, logical1, SZ_512K); + if (ret) + goto out; + + ret = btrfs_delete_raid_extent(trans, logical2 + SZ_512K, SZ_512K); + +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then * delete the 1st 32K, making the new start address 1M+32K. */ @@ -94,45 +703,45 @@ static int test_front_delete(struct btrfs_trans_handle *trans) goto out; } - ret = btrfs_delete_raid_extent(trans, logical, SZ_32K); + ret = btrfs_delete_raid_extent(trans, logical, SZ_16K); if (ret) { test_err("deleting RAID extent [%llu, %llu] failed", logical, - logical + SZ_32K); + logical + SZ_16K); goto out; } - len = SZ_32K; - ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_32K, &len, + len -= SZ_16K; + ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_16K, &len, map_type, 0, &io_stripe); if (ret) { test_err("lookup of RAID extent [%llu, %llu] failed", - logical + SZ_32K, logical + SZ_32K + len); + logical + SZ_16K, logical + SZ_64K); goto out; } - if (io_stripe.physical != logical + SZ_32K) { + if (io_stripe.physical != logical + SZ_16K) { test_err("invalid physical address, expected %llu, got %llu", - logical + SZ_32K, io_stripe.physical); + logical + SZ_16K, io_stripe.physical); ret = -EINVAL; goto out; } - if (len != SZ_32K) { + if (len != SZ_48K) { test_err("invalid stripe length, expected %llu, got %llu", - (u64)SZ_32K, len); + (u64)SZ_48K, len); ret = -EINVAL; goto out; } ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); - if (!ret) { + if (ret != -ENODATA) { ret = -EINVAL; test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", - logical, logical + SZ_32K); + logical, logical + SZ_16K); goto out; } - ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K); + ret = btrfs_delete_raid_extent(trans, logical + SZ_16K, SZ_48K); out: btrfs_put_bioc(bioc); return ret; @@ -209,14 +818,14 @@ static int test_tail_delete(struct btrfs_trans_handle *trans) goto out; } - ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K); + ret = btrfs_delete_raid_extent(trans, logical + SZ_48K, SZ_16K); if (ret) { test_err("deleting RAID extent [%llu, %llu] failed", - logical + SZ_32K, logical + SZ_64K); + logical + SZ_48K, logical + SZ_64K); goto out; } - len = SZ_32K; + len = SZ_48K; ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); if (ret) { test_err("lookup of RAID extent [%llu, %llu] failed", logical, @@ -231,9 +840,19 @@ static int test_tail_delete(struct btrfs_trans_handle *trans) goto out; } - if (len != SZ_32K) { + if (len != SZ_48K) { test_err("invalid stripe length, expected %llu, got %llu", - (u64)SZ_32K, len); + (u64)SZ_48K, len); + ret = -EINVAL; + goto out; + } + + len = SZ_16K; + ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_48K, &len, + map_type, 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded should fail", + logical + SZ_48K, logical + SZ_64K); ret = -EINVAL; goto out; } @@ -456,6 +1075,10 @@ static const test_func_t tests[] = { test_create_update_delete, test_tail_delete, test_front_delete, + test_front_delete_prev_item, + test_punch_hole, + test_punch_hole_3extents, + test_delete_two_extents, }; static int run_test(test_func_t test, u32 sectorsize, u32 nodesize) @@ -478,8 +1101,8 @@ static int run_test(test_func_t test, u32 sectorsize, u32 nodesize) ret = PTR_ERR(root); goto out; } - btrfs_set_super_compat_ro_flags(root->fs_info->super_copy, - BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE); + btrfs_set_super_incompat_flags(root->fs_info->super_copy, + BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE); root->root_key.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = 0; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index dc0b837efd5d..c5c0d9cf1a80 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -160,7 +160,13 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) cache = list_first_entry(&transaction->deleted_bgs, struct btrfs_block_group, bg_list); + /* + * Not strictly necessary to lock, as no other task will be using a + * block_group on the deleted_bgs list during a transaction abort. + */ + spin_lock(&transaction->fs_info->unused_bgs_lock); list_del_init(&cache->bg_list); + spin_unlock(&transaction->fs_info->unused_bgs_lock); btrfs_unfreeze_block_group(cache); btrfs_put_block_group(cache); } @@ -191,7 +197,7 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) list_del_init(&root->dirty_list); free_extent_buffer(root->commit_root); root->commit_root = btrfs_root_node(root); - extent_io_tree_release(&root->dirty_log_pages); + btrfs_extent_io_tree_release(&root->dirty_log_pages); btrfs_qgroup_clean_swapped_blocks(root); } @@ -274,8 +280,10 @@ loop: cur_trans = fs_info->running_transaction; if (cur_trans) { if (TRANS_ABORTED(cur_trans)) { + const int abort_error = cur_trans->aborted; + spin_unlock(&fs_info->trans_lock); - return cur_trans->aborted; + return abort_error; } if (btrfs_blocked_trans_types[cur_trans->state] & type) { spin_unlock(&fs_info->trans_lock); @@ -375,10 +383,10 @@ loop: INIT_LIST_HEAD(&cur_trans->deleted_bgs); spin_lock_init(&cur_trans->dropped_roots_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); - extent_io_tree_init(fs_info, &cur_trans->dirty_pages, - IO_TREE_TRANS_DIRTY_PAGES); - extent_io_tree_init(fs_info, &cur_trans->pinned_extents, - IO_TREE_FS_PINNED_EXTENTS); + btrfs_extent_io_tree_init(fs_info, &cur_trans->dirty_pages, + IO_TREE_TRANS_DIRTY_PAGES); + btrfs_extent_io_tree_init(fs_info, &cur_trans->pinned_extents, + IO_TREE_FS_PINNED_EXTENTS); btrfs_set_fs_generation(fs_info, fs_info->generation + 1); cur_trans->transid = fs_info->generation; fs_info->running_transaction = cur_trans; @@ -530,15 +538,15 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info) } } -static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type) +static bool may_wait_transaction(struct btrfs_fs_info *fs_info, int type) { if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) - return 0; + return false; if (type == TRANS_START) - return 1; + return true; - return 0; + return false; } static inline bool need_reserve_reloc_root(struct btrfs_root *root) @@ -753,9 +761,10 @@ got_it: * value here. */ if (do_chunk_alloc && num_bytes) { - u64 flags = h->block_rsv->space_info->flags; + struct btrfs_space_info *space_info = h->block_rsv->space_info; + u64 flags = space_info->flags; - btrfs_chunk_alloc(h, btrfs_get_alloc_profile(fs_info, flags), + btrfs_chunk_alloc(h, space_info, btrfs_get_alloc_profile(fs_info, flags), CHUNK_ALLOC_NO_FORCE); } @@ -795,8 +804,7 @@ alloc_fail: if (num_bytes) btrfs_block_rsv_release(fs_info, trans_rsv, num_bytes, NULL); if (delayed_refs_bytes) - btrfs_space_info_free_bytes_may_use(fs_info, trans_rsv->space_info, - delayed_refs_bytes); + btrfs_space_info_free_bytes_may_use(trans_rsv->space_info, delayed_refs_bytes); reserve_fail: btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); return ERR_PTR(ret); @@ -1121,13 +1129,13 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, u64 start = 0; u64 end; - while (find_first_extent_bit(dirty_pages, start, &start, &end, - mark, &cached_state)) { + while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end, + mark, &cached_state)) { bool wait_writeback = false; - ret = convert_extent_bit(dirty_pages, start, end, - EXTENT_NEED_WAIT, - mark, &cached_state); + ret = btrfs_convert_extent_bit(dirty_pages, start, end, + EXTENT_NEED_WAIT, + mark, &cached_state); /* * convert_extent_bit can return -ENOMEM, which is most of the * time a temporary error. So when it happens, ignore the error @@ -1148,8 +1156,8 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, if (!ret) ret = filemap_fdatawrite_range(mapping, start, end); if (!ret && wait_writeback) - ret = filemap_fdatawait_range(mapping, start, end); - free_extent_state(cached_state); + btrfs_btree_wait_writeback_range(fs_info, start, end); + btrfs_free_extent_state(cached_state); if (ret) break; cached_state = NULL; @@ -1168,14 +1176,13 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages) { - struct address_space *mapping = fs_info->btree_inode->i_mapping; struct extent_state *cached_state = NULL; u64 start = 0; u64 end; int ret = 0; - while (find_first_extent_bit(dirty_pages, start, &start, &end, - EXTENT_NEED_WAIT, &cached_state)) { + while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end, + EXTENT_NEED_WAIT, &cached_state)) { /* * Ignore -ENOMEM errors returned by clear_extent_bit(). * When committing the transaction, we'll remove any entries @@ -1184,13 +1191,13 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, * concurrently - we do it only at transaction commit time when * it's safe to do it (through extent_io_tree_release()). */ - ret = clear_extent_bit(dirty_pages, start, end, - EXTENT_NEED_WAIT, &cached_state); + ret = btrfs_clear_extent_bit(dirty_pages, start, end, + EXTENT_NEED_WAIT, &cached_state); if (ret == -ENOMEM) ret = 0; if (!ret) - ret = filemap_fdatawait_range(mapping, start, end); - free_extent_state(cached_state); + btrfs_btree_wait_writeback_range(fs_info, start, end); + btrfs_free_extent_state(cached_state); if (ret) break; cached_state = NULL; @@ -1204,15 +1211,15 @@ static int btrfs_wait_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages) { bool errors = false; - int err; + int ret; - err = __btrfs_wait_marked_extents(fs_info, dirty_pages); + ret = __btrfs_wait_marked_extents(fs_info, dirty_pages); if (test_and_clear_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags)) errors = true; - if (errors && !err) - err = -EIO; - return err; + if (errors && !ret) + ret = -EIO; + return ret; } int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark) @@ -1220,22 +1227,22 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark) struct btrfs_fs_info *fs_info = log_root->fs_info; struct extent_io_tree *dirty_pages = &log_root->dirty_log_pages; bool errors = false; - int err; + int ret; ASSERT(btrfs_root_id(log_root) == BTRFS_TREE_LOG_OBJECTID); - err = __btrfs_wait_marked_extents(fs_info, dirty_pages); - if ((mark & EXTENT_DIRTY) && + ret = __btrfs_wait_marked_extents(fs_info, dirty_pages); + if ((mark & EXTENT_DIRTY_LOG1) && test_and_clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags)) errors = true; - if ((mark & EXTENT_NEW) && + if ((mark & EXTENT_DIRTY_LOG2) && test_and_clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags)) errors = true; - if (errors && !err) - err = -EIO; - return err; + if (errors && !ret) + ret = -EIO; + return ret; } /* @@ -1258,7 +1265,7 @@ static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans) blk_finish_plug(&plug); ret2 = btrfs_wait_extents(fs_info, dirty_pages); - extent_io_tree_release(&trans->transaction->dirty_pages); + btrfs_extent_io_tree_release(&trans->transaction->dirty_pages); if (ret) return ret; @@ -1320,7 +1327,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; struct list_head *dirty_bgs = &trans->transaction->dirty_bgs; struct list_head *io_bgs = &trans->transaction->io_bgs; - struct list_head *next; struct extent_buffer *eb; int ret; @@ -1356,13 +1362,13 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) again: while (!list_empty(&fs_info->dirty_cowonly_roots)) { struct btrfs_root *root; - next = fs_info->dirty_cowonly_roots.next; - list_del_init(next); - root = list_entry(next, struct btrfs_root, dirty_list); + + root = list_first_entry(&fs_info->dirty_cowonly_roots, + struct btrfs_root, dirty_list); clear_bit(BTRFS_ROOT_DIRTY, &root->state); + list_move_tail(&root->dirty_list, + &trans->transaction->switch_commits); - list_add_tail(&root->dirty_list, - &trans->transaction->switch_commits); ret = update_cowonly_root(trans, root); if (ret) return ret; @@ -1634,7 +1640,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *root = pending->root; struct btrfs_root *parent_root; struct btrfs_block_rsv *rsv; - struct inode *parent_inode = &pending->dir->vfs_inode; + struct btrfs_inode *parent_inode = pending->dir; struct btrfs_path *path; struct btrfs_dir_item *dir_item; struct extent_buffer *tmp; @@ -1660,7 +1666,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * filesystem. */ nofs_flags = memalloc_nofs_save(); - pending->error = fscrypt_setup_filename(parent_inode, + pending->error = fscrypt_setup_filename(&parent_inode->vfs_inode, &pending->dentry->d_name, 0, &fname); memalloc_nofs_restore(nofs_flags); @@ -1689,8 +1695,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } key.objectid = objectid; - key.offset = (u64)-1; key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; rsv = trans->block_rsv; trans->block_rsv = &pending->block_rsv; @@ -1698,16 +1704,16 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, trace_btrfs_space_reservation(fs_info, "transaction", trans->transid, trans->bytes_reserved, 1); - parent_root = BTRFS_I(parent_inode)->root; + parent_root = parent_inode->root; ret = record_root_in_trans(trans, parent_root, 0); if (ret) goto fail; - cur_time = current_time(parent_inode); + cur_time = current_time(&parent_inode->vfs_inode); /* * insert the directory item */ - ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index); + ret = btrfs_set_inode_index(parent_inode, &index); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1715,7 +1721,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* check if there is a file/dir which has the same name. */ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, - btrfs_ino(BTRFS_I(parent_inode)), + btrfs_ino(parent_inode), &fname.disk_name, 0); if (dir_item != NULL && !IS_ERR(dir_item)) { pending->error = -EEXIST; @@ -1729,8 +1735,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_create_qgroup(trans, objectid); if (ret && ret != -EEXIST) { - btrfs_abort_transaction(trans, ret); - goto fail; + if (ret != -ENOTCONN || btrfs_qgroup_enabled(fs_info)) { + btrfs_abort_transaction(trans, ret); + goto fail; + } } /* @@ -1816,7 +1824,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, */ ret = btrfs_add_root_ref(trans, objectid, btrfs_root_id(parent_root), - btrfs_ino(BTRFS_I(parent_inode)), index, + btrfs_ino(parent_inode), index, &fname.disk_name); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1854,18 +1862,18 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; ret = btrfs_insert_dir_item(trans, &fname.disk_name, - BTRFS_I(parent_inode), &key, BTRFS_FT_DIR, + parent_inode, &key, BTRFS_FT_DIR, index); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } - btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + + btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + fname.disk_name.len * 2); - inode_set_mtime_to_ts(parent_inode, - inode_set_ctime_current(parent_inode)); - ret = btrfs_update_inode_fallback(trans, BTRFS_I(parent_inode)); + inode_set_mtime_to_ts(&parent_inode->vfs_inode, + inode_set_ctime_current(&parent_inode->vfs_inode)); + ret = btrfs_update_inode_fallback(trans, parent_inode); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -2095,7 +2103,14 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); + /* + * Not strictly necessary to lock, as no other task will be using a + * block_group on the new_bgs list during a transaction abort. + */ + spin_lock(&fs_info->unused_bgs_lock); list_del_init(&block_group->bg_list); + btrfs_put_block_group(block_group); + spin_unlock(&fs_info->unused_bgs_lock); } } @@ -2150,13 +2165,19 @@ static void add_pending_snapshot(struct btrfs_trans_handle *trans) list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots); } -static void update_commit_stats(struct btrfs_fs_info *fs_info, ktime_t interval) +static void update_commit_stats(struct btrfs_fs_info *fs_info) { + ktime_t now = ktime_get_ns(); + ktime_t interval = now - fs_info->commit_stats.critical_section_start_time; + + ASSERT(fs_info->commit_stats.critical_section_start_time); + fs_info->commit_stats.commit_count++; fs_info->commit_stats.last_commit_dur = interval; fs_info->commit_stats.max_commit_dur = max_t(u64, fs_info->commit_stats.max_commit_dur, interval); fs_info->commit_stats.total_commit_dur += interval; + fs_info->commit_stats.critical_section_start_time = 0; } int btrfs_commit_transaction(struct btrfs_trans_handle *trans) @@ -2165,8 +2186,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_transaction *prev_trans = NULL; int ret; - ktime_t start_time; - ktime_t interval; ASSERT(refcount_read(&trans->use_count) == 1); btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); @@ -2257,14 +2276,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wake_up(&fs_info->transaction_blocked_wait); btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); - if (cur_trans->list.prev != &fs_info->trans_list) { + if (!list_is_first(&cur_trans->list, &fs_info->trans_list)) { enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; if (trans->in_fsync) want_state = TRANS_STATE_SUPER_COMMITTED; - prev_trans = list_entry(cur_trans->list.prev, - struct btrfs_transaction, list); + prev_trans = list_prev_entry(cur_trans, list); if (prev_trans->state < want_state) { refcount_inc(&prev_trans->use_count); spin_unlock(&fs_info->trans_lock); @@ -2300,8 +2318,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * Get the time spent on the work done by the commit thread and not * the time spent waiting on a previous commit */ - start_time = ktime_get_ns(); - + fs_info->commit_stats.critical_section_start_time = ktime_get_ns(); extwriter_counter_dec(cur_trans, trans->type); ret = btrfs_start_delalloc_flush(fs_info); @@ -2533,6 +2550,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) goto scrub_continue; + update_commit_stats(fs_info); /* * We needn't acquire the lock here because there is no other task * which can change it. @@ -2541,7 +2559,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wake_up(&cur_trans->commit_wait); btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); - btrfs_finish_extent_commit(trans); + ret = btrfs_finish_extent_commit(trans); + if (ret) + goto scrub_continue; if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) btrfs_clear_space_info_full(fs_info); @@ -2567,8 +2587,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) trace_btrfs_transaction_commit(fs_info); - interval = ktime_get_ns() - start_time; - btrfs_scrub_continue(fs_info); if (current->journal_info == trans) @@ -2576,8 +2594,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) kmem_cache_free(btrfs_trans_handle_cachep, trans); - update_commit_stats(fs_info, interval); - return ret; unlock_reloc: diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 184fa5c0062a..9f7c777af635 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -227,7 +227,21 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) delayed_refs->qgroup_to_skip = 0; } -bool __cold abort_should_print_stack(int error); +/* + * We want the transaction abort to print stack trace only for errors where the + * cause could be a bug, eg. due to ENOSPC, and not for common errors that are + * caused by external factors. + */ +static inline bool btrfs_abort_should_print_stack(int error) +{ + switch (error) { + case -EIO: + case -EROFS: + case -ENOMEM: + return false; + } + return true; +} /* * Call btrfs_abort_transaction as early as possible when an error condition is @@ -240,7 +254,7 @@ do { \ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ &((trans)->fs_info->fs_state))) { \ __first = true; \ - if (WARN(abort_should_print_stack(error), \ + if (WARN(btrfs_abort_should_print_stack(error), \ KERN_ERR \ "BTRFS: Transaction aborted (error %d)\n", \ (error))) { \ diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 148d8cefa40e..0f556f4de3f9 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -191,7 +191,7 @@ static bool check_prev_ino(struct extent_buffer *leaf, * Only subvolume trees along with their reloc trees need this check. * Things like log tree doesn't follow this ino requirement. */ - if (!is_fstree(btrfs_header_owner(leaf))) + if (!btrfs_is_fstree(btrfs_header_owner(leaf))) return true; if (key->objectid == prev_key->objectid) @@ -475,7 +475,7 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key, * to be COWed to be relocated. */ if (unlikely(is_root_item && key->objectid == BTRFS_TREE_RELOC_OBJECTID && - !is_fstree(key->offset))) { + !btrfs_is_fstree(key->offset))) { generic_err(leaf, slot, "invalid reloc tree for root %lld, root id is not a subvolume tree", key->offset); @@ -493,7 +493,7 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key, } /* DIR_ITEM/INDEX/INODE_REF is not allowed to point to non-fs trees */ - if (unlikely(!is_fstree(key->objectid) && !is_root_item)) { + if (unlikely(!btrfs_is_fstree(key->objectid) && !is_root_item)) { dir_item_err(leaf, slot, "invalid location key objectid, have %llu expect [%llu, %llu]", key->objectid, BTRFS_FIRST_FREE_OBJECTID, @@ -764,22 +764,19 @@ static int check_block_group_item(struct extent_buffer *leaf, return 0; } -__printf(4, 5) +__printf(5, 6) __cold -static void chunk_err(const struct extent_buffer *leaf, +static void chunk_err(const struct btrfs_fs_info *fs_info, + const struct extent_buffer *leaf, const struct btrfs_chunk *chunk, u64 logical, const char *fmt, ...) { - const struct btrfs_fs_info *fs_info = leaf->fs_info; - bool is_sb; + bool is_sb = !leaf; struct va_format vaf; va_list args; int i; int slot = -1; - /* Only superblock eb is able to have such small offset */ - is_sb = (leaf->start == BTRFS_SUPER_INFO_OFFSET); - if (!is_sb) { /* * Get the slot number by iterating through all slots, this @@ -812,13 +809,17 @@ static void chunk_err(const struct extent_buffer *leaf, /* * The common chunk check which could also work on super block sys chunk array. * + * If @leaf is NULL, then @chunk must be an on-stack chunk item. + * (For superblock sys_chunk array, and fs_info->sectorsize is unreliable) + * * Return -EUCLEAN if anything is corrupted. * Return 0 if everything is OK. */ -int btrfs_check_chunk_valid(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, u64 logical) +int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info, + const struct extent_buffer *leaf, + const struct btrfs_chunk *chunk, u64 logical, + u32 sectorsize) { - struct btrfs_fs_info *fs_info = leaf->fs_info; u64 length; u64 chunk_end; u64 stripe_len; @@ -826,63 +827,73 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, u16 sub_stripes; u64 type; u64 features; + u32 chunk_sector_size; bool mixed = false; int raid_index; int nparity; int ncopies; - length = btrfs_chunk_length(leaf, chunk); - stripe_len = btrfs_chunk_stripe_len(leaf, chunk); - num_stripes = btrfs_chunk_num_stripes(leaf, chunk); - sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); - type = btrfs_chunk_type(leaf, chunk); + if (leaf) { + length = btrfs_chunk_length(leaf, chunk); + stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + type = btrfs_chunk_type(leaf, chunk); + chunk_sector_size = btrfs_chunk_sector_size(leaf, chunk); + } else { + length = btrfs_stack_chunk_length(chunk); + stripe_len = btrfs_stack_chunk_stripe_len(chunk); + num_stripes = btrfs_stack_chunk_num_stripes(chunk); + sub_stripes = btrfs_stack_chunk_sub_stripes(chunk); + type = btrfs_stack_chunk_type(chunk); + chunk_sector_size = btrfs_stack_chunk_sector_size(chunk); + } raid_index = btrfs_bg_flags_to_raid_index(type); ncopies = btrfs_raid_array[raid_index].ncopies; nparity = btrfs_raid_array[raid_index].nparity; if (unlikely(!num_stripes)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk num_stripes, have %u", num_stripes); return -EUCLEAN; } if (unlikely(num_stripes < ncopies)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk num_stripes < ncopies, have %u < %d", num_stripes, ncopies); return -EUCLEAN; } if (unlikely(nparity && num_stripes == nparity)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk num_stripes == nparity, have %u == %d", num_stripes, nparity); return -EUCLEAN; } - if (unlikely(!IS_ALIGNED(logical, fs_info->sectorsize))) { - chunk_err(leaf, chunk, logical, + if (unlikely(!IS_ALIGNED(logical, sectorsize))) { + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk logical, have %llu should aligned to %u", - logical, fs_info->sectorsize); + logical, sectorsize); return -EUCLEAN; } - if (unlikely(btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize)) { - chunk_err(leaf, chunk, logical, + if (unlikely(chunk_sector_size != sectorsize)) { + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk sectorsize, have %u expect %u", - btrfs_chunk_sector_size(leaf, chunk), - fs_info->sectorsize); + chunk_sector_size, sectorsize); return -EUCLEAN; } - if (unlikely(!length || !IS_ALIGNED(length, fs_info->sectorsize))) { - chunk_err(leaf, chunk, logical, + if (unlikely(!length || !IS_ALIGNED(length, sectorsize))) { + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk length, have %llu", length); return -EUCLEAN; } if (unlikely(check_add_overflow(logical, length, &chunk_end))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk logical start and length, have logical start %llu length %llu", logical, length); return -EUCLEAN; } if (unlikely(!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk stripe length: %llu", stripe_len); return -EUCLEAN; @@ -896,30 +907,29 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, * Thus it should be a good way to catch obvious bitflips. */ if (unlikely(length >= btrfs_stripe_nr_to_offset(U32_MAX))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "chunk length too large: have %llu limit %llu", length, btrfs_stripe_nr_to_offset(U32_MAX)); return -EUCLEAN; } if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "unrecognized chunk type: 0x%llx", ~(BTRFS_BLOCK_GROUP_TYPE_MASK | - BTRFS_BLOCK_GROUP_PROFILE_MASK) & - btrfs_chunk_type(leaf, chunk)); + BTRFS_BLOCK_GROUP_PROFILE_MASK) & type); return -EUCLEAN; } if (unlikely(!has_single_bit_set(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) && (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set", type & BTRFS_BLOCK_GROUP_PROFILE_MASK); return -EUCLEAN; } if (unlikely((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "missing chunk type flag, have 0x%llx one bit must be set in 0x%llx", type, BTRFS_BLOCK_GROUP_TYPE_MASK); return -EUCLEAN; @@ -928,7 +938,7 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, if (unlikely((type & BTRFS_BLOCK_GROUP_SYSTEM) && (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA)))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "system chunk with data or metadata type: 0x%llx", type); return -EUCLEAN; @@ -941,7 +951,7 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, if (!mixed) { if (unlikely((type & BTRFS_BLOCK_GROUP_METADATA) && (type & BTRFS_BLOCK_GROUP_DATA))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "mixed chunk type in non-mixed mode: 0x%llx", type); return -EUCLEAN; } @@ -963,7 +973,7 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, num_stripes != btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes) || ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && num_stripes != btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid num_stripes:sub_stripes %u:%u for profile %llu", num_stripes, sub_stripes, type & BTRFS_BLOCK_GROUP_PROFILE_MASK); @@ -983,14 +993,15 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf, struct btrfs_chunk *chunk, struct btrfs_key *key, int slot) { + struct btrfs_fs_info *fs_info = leaf->fs_info; int num_stripes; if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) { - chunk_err(leaf, chunk, key->offset, + chunk_err(fs_info, leaf, chunk, key->offset, "invalid chunk item size: have %u expect [%zu, %u)", btrfs_item_size(leaf, slot), sizeof(struct btrfs_chunk), - BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); + BTRFS_LEAF_DATA_SIZE(fs_info)); return -EUCLEAN; } @@ -1001,14 +1012,15 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf, if (unlikely(btrfs_chunk_item_size(num_stripes) != btrfs_item_size(leaf, slot))) { - chunk_err(leaf, chunk, key->offset, + chunk_err(fs_info, leaf, chunk, key->offset, "invalid chunk item size: have %u expect %lu", btrfs_item_size(leaf, slot), btrfs_chunk_item_size(num_stripes)); return -EUCLEAN; } out: - return btrfs_check_chunk_valid(leaf, chunk, key->offset); + return btrfs_check_chunk_valid(fs_info, leaf, chunk, key->offset, + fs_info->sectorsize); } __printf(3, 4) @@ -1299,7 +1311,7 @@ static bool is_valid_dref_root(u64 rootid) * - tree root * For v1 space cache */ - return is_fstree(rootid) || rootid == BTRFS_DATA_RELOC_TREE_OBJECTID || + return btrfs_is_fstree(rootid) || rootid == BTRFS_DATA_RELOC_TREE_OBJECTID || rootid == BTRFS_ROOT_TREE_OBJECTID; } @@ -1527,6 +1539,11 @@ static int check_extent_item(struct extent_buffer *leaf, dref_offset, fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) { + extent_err(leaf, slot, + "invalid data ref count, should have non-zero value"); + return -EUCLEAN; + } inline_refs += btrfs_extent_data_ref_count(leaf, dref); break; /* Contains parent bytenr and ref count */ @@ -1539,6 +1556,11 @@ static int check_extent_item(struct extent_buffer *leaf, inline_offset, fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) { + extent_err(leaf, slot, + "invalid shared data ref count, should have non-zero value"); + return -EUCLEAN; + } inline_refs += btrfs_shared_data_ref_count(leaf, sref); break; case BTRFS_EXTENT_OWNER_REF_KEY: @@ -1549,7 +1571,7 @@ static int check_extent_item(struct extent_buffer *leaf, inline_type); return -EUCLEAN; } - if (inline_type < last_type) { + if (unlikely(inline_type < last_type)) { extent_err(leaf, slot, "inline ref out-of-order: has type %u, prev type %u", inline_type, last_type); @@ -1558,7 +1580,7 @@ static int check_extent_item(struct extent_buffer *leaf, /* Type changed, allow the sequence starts from U64_MAX again. */ if (inline_type > last_type) last_seq = U64_MAX; - if (seq > last_seq) { + if (unlikely(seq > last_seq)) { extent_err(leaf, slot, "inline ref out-of-order: has type %u offset %llu seq 0x%llx, prev type %u seq 0x%llx", inline_type, inline_offset, seq, @@ -1611,8 +1633,18 @@ static int check_simple_keyed_refs(struct extent_buffer *leaf, { u32 expect_item_size = 0; - if (key->type == BTRFS_SHARED_DATA_REF_KEY) + if (key->type == BTRFS_SHARED_DATA_REF_KEY) { + struct btrfs_shared_data_ref *sref; + + sref = btrfs_item_ptr(leaf, slot, struct btrfs_shared_data_ref); + if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) { + extent_err(leaf, slot, + "invalid shared data backref count, should have non-zero value"); + return -EUCLEAN; + } + expect_item_size = sizeof(struct btrfs_shared_data_ref); + } if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) { generic_err(leaf, slot, @@ -1689,6 +1721,11 @@ static int check_extent_data_ref(struct extent_buffer *leaf, offset, leaf->fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) { + extent_err(leaf, slot, + "invalid extent data backref count, should have non-zero value"); + return -EUCLEAN; + } } return 0; } @@ -1892,7 +1929,7 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf, break; } - if (ret) + if (unlikely(ret)) return BTRFS_TREE_BLOCK_INVALID_ITEM; return BTRFS_TREE_BLOCK_CLEAN; } @@ -2130,7 +2167,7 @@ ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO); int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) { - const bool is_subvol = is_fstree(root_owner); + const bool is_subvol = btrfs_is_fstree(root_owner); const u64 eb_owner = btrfs_header_owner(eb); /* @@ -2172,7 +2209,7 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) * For subvolume trees, owners can mismatch, but they should all belong * to subvolume trees. */ - if (unlikely(is_subvol != is_fstree(eb_owner))) { + if (unlikely(is_subvol != btrfs_is_fstree(eb_owner))) { btrfs_crit(eb->fs_info, "corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect [%llu, %llu]", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -2192,13 +2229,12 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int ret; found_level = btrfs_header_level(eb); - if (found_level != check->level) { - WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), - KERN_ERR "BTRFS: tree level check failed\n"); + if (unlikely(found_level != check->level)) { + DEBUG_WARN(); btrfs_err(fs_info, "tree level mismatch detected, bytenr=%llu level expected=%u has=%u", eb->start, check->level, found_level); - return -EIO; + return -EUCLEAN; } if (!check->has_first_key) @@ -2214,11 +2250,11 @@ int btrfs_verify_level_key(struct extent_buffer *eb, return 0; /* We have @first_key, so this @eb must have at least one item */ - if (btrfs_header_nritems(eb) == 0) { + if (unlikely(btrfs_header_nritems(eb) == 0)) { btrfs_err(fs_info, "invalid tree nritems, bytenr=%llu nritems=0 expect >0", eb->start); - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN(); return -EUCLEAN; } @@ -2226,11 +2262,10 @@ int btrfs_verify_level_key(struct extent_buffer *eb, btrfs_node_key_to_cpu(eb, &found_key, 0); else btrfs_item_key_to_cpu(eb, &found_key, 0); - ret = btrfs_comp_cpu_keys(&check->first_key, &found_key); - if (ret) { - WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), - KERN_ERR "BTRFS: tree first key check failed\n"); + ret = btrfs_comp_cpu_keys(&check->first_key, &found_key); + if (unlikely(ret)) { + DEBUG_WARN(); btrfs_err(fs_info, "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", eb->start, check->transid, check->first_key.objectid, diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index db67f96cbe4b..eb201f4ec3c7 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -10,6 +10,7 @@ #include <uapi/linux/btrfs_tree.h> struct extent_buffer; +struct btrfs_fs_info; struct btrfs_chunk; struct btrfs_key; @@ -66,8 +67,10 @@ enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node); int btrfs_check_leaf(struct extent_buffer *leaf); int btrfs_check_node(struct extent_buffer *node); -int btrfs_check_chunk_valid(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, u64 logical); +int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info, + const struct extent_buffer *leaf, + const struct btrfs_chunk *chunk, u64 logical, + u32 sectorsize); int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner); int btrfs_verify_level_key(struct extent_buffer *eb, const struct btrfs_tree_parent_check *check); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c8d6587688b3..9f05d454b9df 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -112,7 +112,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path, - u64 dirid, int del_all); + u64 dirid, bool del_all); static void wait_log_commit(struct btrfs_root *root, int transid); /* @@ -138,10 +138,13 @@ static void wait_log_commit(struct btrfs_root *root, int transid); * and once to do all the other items. */ -static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root) +static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root) { unsigned int nofs_flag; - struct inode *inode; + struct btrfs_inode *inode; + + /* Only meant to be called for subvolume roots and not for log roots. */ + ASSERT(btrfs_is_fstree(btrfs_root_id(root))); /* * We're holding a transaction handle whether we are logging or @@ -376,12 +379,12 @@ static int process_one_buffer(struct btrfs_root *log, } /* - * Item overwrite used by replay and tree logging. eb, slot and key all refer - * to the src data we are copying out. + * Item overwrite used by log replay. The given eb, slot and key all refer to + * the source data we are copying out. * - * root is the tree we are copying into, and path is a scratch - * path for use in this function (it should be released on entry and - * will be released on exit). + * The given root is for the tree we are copying into, and path is a scratch + * path for use in this function (it should be released on entry and will be + * released on exit). * * If the key is already in the destination tree the existing item is * overwritten. If the existing item isn't big enough, it is extended. @@ -401,6 +404,8 @@ static int overwrite_item(struct btrfs_trans_handle *trans, int save_old_i_size = 0; unsigned long src_ptr; unsigned long dst_ptr; + struct extent_buffer *dst_eb; + int dst_slot; bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; /* @@ -420,11 +425,13 @@ static int overwrite_item(struct btrfs_trans_handle *trans, if (ret < 0) return ret; + dst_eb = path->nodes[0]; + dst_slot = path->slots[0]; + if (ret == 0) { char *src_copy; - char *dst_copy; - u32 dst_size = btrfs_item_size(path->nodes[0], - path->slots[0]); + const u32 dst_size = btrfs_item_size(dst_eb, dst_slot); + if (dst_size != item_size) goto insert; @@ -432,23 +439,16 @@ static int overwrite_item(struct btrfs_trans_handle *trans, btrfs_release_path(path); return 0; } - dst_copy = kmalloc(item_size, GFP_NOFS); src_copy = kmalloc(item_size, GFP_NOFS); - if (!dst_copy || !src_copy) { + if (!src_copy) { btrfs_release_path(path); - kfree(dst_copy); - kfree(src_copy); return -ENOMEM; } read_extent_buffer(eb, src_copy, src_ptr, item_size); + dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot); + ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size); - dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); - read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, - item_size); - ret = memcmp(dst_copy, src_copy, item_size); - - kfree(dst_copy); kfree(src_copy); /* * they have the same contents, just return, this saves @@ -470,9 +470,9 @@ static int overwrite_item(struct btrfs_trans_handle *trans, u64 nbytes; u32 mode; - item = btrfs_item_ptr(path->nodes[0], path->slots[0], + item = btrfs_item_ptr(dst_eb, dst_slot, struct btrfs_inode_item); - nbytes = btrfs_inode_nbytes(path->nodes[0], item); + nbytes = btrfs_inode_nbytes(dst_eb, item); item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); btrfs_set_inode_nbytes(eb, item, nbytes); @@ -514,11 +514,13 @@ insert: key, item_size); path->skip_release_on_error = 0; + dst_eb = path->nodes[0]; + dst_slot = path->slots[0]; + /* make sure any existing item is the correct size */ if (ret == -EEXIST || ret == -EOVERFLOW) { - u32 found_size; - found_size = btrfs_item_size(path->nodes[0], - path->slots[0]); + const u32 found_size = btrfs_item_size(dst_eb, dst_slot); + if (found_size > item_size) btrfs_truncate_item(trans, path, item_size, 1); else if (found_size < item_size) @@ -526,8 +528,7 @@ insert: } else if (ret) { return ret; } - dst_ptr = btrfs_item_ptr_offset(path->nodes[0], - path->slots[0]); + dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot); /* don't overwrite an existing inode if the generation number * was logged as zero. This is done when the tree logging code @@ -546,7 +547,6 @@ insert: dst_item = (struct btrfs_inode_item *)dst_ptr; if (btrfs_inode_generation(eb, src_item) == 0) { - struct extent_buffer *dst_eb = path->nodes[0]; const u64 ino_size = btrfs_inode_size(eb, src_item); /* @@ -564,33 +564,30 @@ insert: } if (S_ISDIR(btrfs_inode_mode(eb, src_item)) && - S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { + S_ISDIR(btrfs_inode_mode(dst_eb, dst_item))) { save_old_i_size = 1; - saved_i_size = btrfs_inode_size(path->nodes[0], - dst_item); + saved_i_size = btrfs_inode_size(dst_eb, dst_item); } } - copy_extent_buffer(path->nodes[0], eb, dst_ptr, - src_ptr, item_size); + copy_extent_buffer(dst_eb, eb, dst_ptr, src_ptr, item_size); if (save_old_i_size) { struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; - btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); + btrfs_set_inode_size(dst_eb, dst_item, saved_i_size); } /* make sure the generation is filled in */ if (key->type == BTRFS_INODE_ITEM_KEY) { struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; - if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { - btrfs_set_inode_generation(path->nodes[0], dst_item, - trans->transid); - } + if (btrfs_inode_generation(dst_eb, dst_item) == 0) + btrfs_set_inode_generation(dst_eb, dst_item, trans->transid); } no_copy: - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); return 0; } @@ -610,21 +607,6 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, return 0; } -/* - * simple helper to read an inode off the disk from a given root - * This can only be called for subvolume roots and not for the log - */ -static noinline struct inode *read_one_inode(struct btrfs_root *root, - u64 objectid) -{ - struct inode *inode; - - inode = btrfs_iget_logging(objectid, root); - if (IS_ERR(inode)) - inode = NULL; - return inode; -} - /* replays a single extent in 'eb' at 'slot' with 'key' into the * subvolume 'root'. path is released on entry and should be released * on exit. @@ -650,7 +632,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, u64 start = key->offset; u64 nbytes = 0; struct btrfs_file_extent_item *item; - struct inode *inode = NULL; + struct btrfs_inode *inode = NULL; unsigned long size; int ret = 0; @@ -674,46 +656,38 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, extent_end = ALIGN(start + size, fs_info->sectorsize); } else { - ret = 0; - goto out; + btrfs_err(fs_info, + "unexpected extent type=%d root=%llu inode=%llu offset=%llu", + found_type, btrfs_root_id(root), key->objectid, key->offset); + return -EUCLEAN; } - inode = read_one_inode(root, key->objectid); - if (!inode) { - ret = -EIO; - goto out; - } + inode = btrfs_iget_logging(key->objectid, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); /* * first check to see if we already have this extent in the * file. This must be done before the btrfs_drop_extents run * so we don't try to drop this extent. */ - ret = btrfs_lookup_file_extent(trans, root, path, - btrfs_ino(BTRFS_I(inode)), start, 0); + ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), start, 0); if (ret == 0 && (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC)) { - struct btrfs_file_extent_item cmp1; - struct btrfs_file_extent_item cmp2; - struct btrfs_file_extent_item *existing; - struct extent_buffer *leaf; - - leaf = path->nodes[0]; - existing = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); + struct btrfs_file_extent_item existing; + unsigned long ptr; - read_extent_buffer(eb, &cmp1, (unsigned long)item, - sizeof(cmp1)); - read_extent_buffer(leaf, &cmp2, (unsigned long)existing, - sizeof(cmp2)); + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + read_extent_buffer(path->nodes[0], &existing, ptr, sizeof(existing)); /* * we already have a pointer to this exact extent, * we don't have to do anything */ - if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { + if (memcmp_extent_buffer(eb, &existing, (unsigned long)item, + sizeof(existing)) == 0) { btrfs_release_path(path); goto out; } @@ -724,7 +698,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, drop_args.start = start; drop_args.end = extent_end; drop_args.drop_cache = true; - ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); + ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) goto out; @@ -748,8 +722,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, (unsigned long)item, sizeof(*item)); ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); - ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); ins.type = BTRFS_EXTENT_ITEM_KEY; + ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); offset = key->offset - btrfs_file_extent_offset(eb, item); /* @@ -874,9 +848,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum *sums; struct btrfs_root *csum_root; - sums = list_entry(ordered_sums.next, - struct btrfs_ordered_sum, - list); + sums = list_first_entry(&ordered_sums, + struct btrfs_ordered_sum, + list); csum_root = btrfs_csum_root(fs_info, sums->logical); if (!ret) @@ -902,16 +876,15 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, goto out; } - ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, - extent_end - start); + ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start); if (ret) goto out; update_inode: - btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found); - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found); + ret = btrfs_update_inode(trans, inode); out: - iput(inode); + iput(&inode->vfs_inode); return ret; } @@ -948,7 +921,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di) { struct btrfs_root *root = dir->root; - struct inode *inode; + struct btrfs_inode *inode; struct fscrypt_str name; struct extent_buffer *leaf; struct btrfs_key location; @@ -963,9 +936,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, btrfs_release_path(path); - inode = read_one_inode(root, location.objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(location.objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; goto out; } @@ -973,10 +947,11 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name); + ret = unlink_inode_for_log_replay(trans, dir, inode, &name); out: kfree(name.name); - iput(inode); + if (inode) + iput(&inode->vfs_inode); return ret; } @@ -1066,6 +1041,126 @@ out: return ret; } +static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_root *log_root, + struct btrfs_key *search_key, + struct btrfs_inode *dir, + struct btrfs_inode *inode, + u64 parent_objectid) +{ + struct extent_buffer *leaf = path->nodes[0]; + unsigned long ptr; + unsigned long ptr_end; + + /* + * Check all the names in this back reference to see if they are in the + * log. If so, we allow them to stay otherwise they must be unlinked as + * a conflict. + */ + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); + while (ptr < ptr_end) { + struct fscrypt_str victim_name; + struct btrfs_inode_ref *victim_ref; + int ret; + + victim_ref = (struct btrfs_inode_ref *)ptr; + ret = read_alloc_one_name(leaf, (victim_ref + 1), + btrfs_inode_ref_name_len(leaf, victim_ref), + &victim_name); + if (ret) + return ret; + + ret = backref_in_log(log_root, search_key, parent_objectid, &victim_name); + if (ret) { + kfree(victim_name.name); + if (ret < 0) + return ret; + ptr = (unsigned long)(victim_ref + 1) + victim_name.len; + continue; + } + + inc_nlink(&inode->vfs_inode); + btrfs_release_path(path); + + ret = unlink_inode_for_log_replay(trans, dir, inode, &victim_name); + kfree(victim_name.name); + if (ret) + return ret; + return -EAGAIN; + } + + return 0; +} + +static int unlink_extrefs_not_in_log(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_root *root, + struct btrfs_root *log_root, + struct btrfs_key *search_key, + struct btrfs_inode *inode, + u64 inode_objectid, + u64 parent_objectid) +{ + struct extent_buffer *leaf = path->nodes[0]; + const unsigned long base = btrfs_item_ptr_offset(leaf, path->slots[0]); + const u32 item_size = btrfs_item_size(leaf, path->slots[0]); + u32 cur_offset = 0; + + while (cur_offset < item_size) { + struct btrfs_inode_extref *extref; + struct btrfs_inode *victim_parent; + struct fscrypt_str victim_name; + int ret; + + extref = (struct btrfs_inode_extref *)(base + cur_offset); + victim_name.len = btrfs_inode_extref_name_len(leaf, extref); + + if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) + goto next; + + ret = read_alloc_one_name(leaf, &extref->name, victim_name.len, + &victim_name); + if (ret) + return ret; + + search_key->objectid = inode_objectid; + search_key->type = BTRFS_INODE_EXTREF_KEY; + search_key->offset = btrfs_extref_hash(parent_objectid, + victim_name.name, + victim_name.len); + ret = backref_in_log(log_root, search_key, parent_objectid, &victim_name); + if (ret) { + kfree(victim_name.name); + if (ret < 0) + return ret; +next: + cur_offset += victim_name.len + sizeof(*extref); + continue; + } + + victim_parent = btrfs_iget_logging(parent_objectid, root); + if (IS_ERR(victim_parent)) { + kfree(victim_name.name); + return PTR_ERR(victim_parent); + } + + inc_nlink(&inode->vfs_inode); + btrfs_release_path(path); + + ret = unlink_inode_for_log_replay(trans, victim_parent, inode, + &victim_name); + iput(&victim_parent->vfs_inode); + kfree(victim_name.name); + if (ret) + return ret; + return -EAGAIN; + } + + return 0; +} + static inline int __add_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -1076,7 +1171,6 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, u64 ref_index, struct fscrypt_str *name) { int ret; - struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key search_key; struct btrfs_inode_extref *extref; @@ -1087,121 +1181,37 @@ again: search_key.type = BTRFS_INODE_REF_KEY; search_key.offset = parent_objectid; ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); - if (ret == 0) { - struct btrfs_inode_ref *victim_ref; - unsigned long ptr; - unsigned long ptr_end; - - leaf = path->nodes[0]; - - /* are we trying to overwrite a back ref for the root directory - * if so, just jump out, we're done + if (ret < 0) { + return ret; + } else if (ret == 0) { + /* + * Are we trying to overwrite a back ref for the root directory? + * If so, we're done. */ if (search_key.objectid == search_key.offset) return 1; - /* check all the names in this back reference to see - * if they are in the log. if so, we allow them to stay - * otherwise they must be unlinked as a conflict - */ - ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); - ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); - while (ptr < ptr_end) { - struct fscrypt_str victim_name; - - victim_ref = (struct btrfs_inode_ref *)ptr; - ret = read_alloc_one_name(leaf, (victim_ref + 1), - btrfs_inode_ref_name_len(leaf, victim_ref), - &victim_name); - if (ret) - return ret; - - ret = backref_in_log(log_root, &search_key, - parent_objectid, &victim_name); - if (ret < 0) { - kfree(victim_name.name); - return ret; - } else if (!ret) { - inc_nlink(&inode->vfs_inode); - btrfs_release_path(path); - - ret = unlink_inode_for_log_replay(trans, dir, inode, - &victim_name); - kfree(victim_name.name); - if (ret) - return ret; - goto again; - } - kfree(victim_name.name); - - ptr = (unsigned long)(victim_ref + 1) + victim_name.len; - } + ret = unlink_refs_not_in_log(trans, path, log_root, &search_key, + dir, inode, parent_objectid); + if (ret == -EAGAIN) + goto again; + else if (ret) + return ret; } btrfs_release_path(path); /* Same search but for extended refs */ - extref = btrfs_lookup_inode_extref(NULL, root, path, name, - inode_objectid, parent_objectid, 0, - 0); + extref = btrfs_lookup_inode_extref(root, path, name, inode_objectid, parent_objectid); if (IS_ERR(extref)) { return PTR_ERR(extref); } else if (extref) { - u32 item_size; - u32 cur_offset = 0; - unsigned long base; - struct inode *victim_parent; - - leaf = path->nodes[0]; - - item_size = btrfs_item_size(leaf, path->slots[0]); - base = btrfs_item_ptr_offset(leaf, path->slots[0]); - - while (cur_offset < item_size) { - struct fscrypt_str victim_name; - - extref = (struct btrfs_inode_extref *)(base + cur_offset); - - if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) - goto next; - - ret = read_alloc_one_name(leaf, &extref->name, - btrfs_inode_extref_name_len(leaf, extref), - &victim_name); - if (ret) - return ret; - - search_key.objectid = inode_objectid; - search_key.type = BTRFS_INODE_EXTREF_KEY; - search_key.offset = btrfs_extref_hash(parent_objectid, - victim_name.name, - victim_name.len); - ret = backref_in_log(log_root, &search_key, - parent_objectid, &victim_name); - if (ret < 0) { - kfree(victim_name.name); - return ret; - } else if (!ret) { - ret = -ENOENT; - victim_parent = read_one_inode(root, - parent_objectid); - if (victim_parent) { - inc_nlink(&inode->vfs_inode); - btrfs_release_path(path); - - ret = unlink_inode_for_log_replay(trans, - BTRFS_I(victim_parent), - inode, &victim_name); - } - iput(victim_parent); - kfree(victim_name.name); - if (ret) - return ret; - goto again; - } - kfree(victim_name.name); -next: - cur_offset += victim_name.len + sizeof(*extref); - } + ret = unlink_extrefs_not_in_log(trans, path, root, log_root, + &search_key, inode, + inode_objectid, parent_objectid); + if (ret == -EAGAIN) + goto again; + else if (ret) + return ret; } btrfs_release_path(path); @@ -1326,19 +1336,18 @@ again: ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name); if (!ret) { - struct inode *dir; + struct btrfs_inode *dir; btrfs_release_path(path); - dir = read_one_inode(root, parent_id); - if (!dir) { - ret = -ENOENT; + dir = btrfs_iget_logging(parent_id, root); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); kfree(name.name); goto out; } - ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), - inode, &name); + ret = unlink_inode_for_log_replay(trans, dir, inode, &name); kfree(name.name); - iput(dir); + iput(&dir->vfs_inode); if (ret) goto out; goto again; @@ -1370,13 +1379,13 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - struct inode *dir = NULL; - struct inode *inode = NULL; + struct btrfs_inode *dir = NULL; + struct btrfs_inode *inode = NULL; unsigned long ref_ptr; unsigned long ref_end; struct fscrypt_str name = { 0 }; int ret; - int log_ref_ver = 0; + const bool is_extref_item = (key->type == BTRFS_INODE_EXTREF_KEY); u64 parent_objectid; u64 inode_objectid; u64 ref_index = 0; @@ -1385,11 +1394,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ref_ptr = btrfs_item_ptr_offset(eb, slot); ref_end = ref_ptr + btrfs_item_size(eb, slot); - if (key->type == BTRFS_INODE_EXTREF_KEY) { + if (is_extref_item) { struct btrfs_inode_extref *r; ref_struct_size = sizeof(struct btrfs_inode_extref); - log_ref_ver = 1; r = (struct btrfs_inode_extref *)ref_ptr; parent_objectid = btrfs_inode_extref_parent(eb, r); } else { @@ -1404,40 +1412,64 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * copy the back ref in. The link count fixup code will take * care of the rest */ - dir = read_one_inode(root, parent_objectid); - if (!dir) { - ret = -ENOENT; + dir = btrfs_iget_logging(parent_objectid, root); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + if (ret == -ENOENT) + ret = 0; + dir = NULL; goto out; } - inode = read_one_inode(root, inode_objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(inode_objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; goto out; } while (ref_ptr < ref_end) { - if (log_ref_ver) { + if (is_extref_item) { ret = extref_get_fields(eb, ref_ptr, &name, &ref_index, &parent_objectid); + if (ret) + goto out; /* * parent object can change from one array * item to another. */ - if (!dir) - dir = read_one_inode(root, parent_objectid); if (!dir) { - ret = -ENOENT; - goto out; + dir = btrfs_iget_logging(parent_objectid, root); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + dir = NULL; + /* + * A new parent dir may have not been + * logged and not exist in the subvolume + * tree, see the comment above before + * the loop when getting the first + * parent dir. + */ + if (ret == -ENOENT) { + /* + * The next extref may refer to + * another parent dir that + * exists, so continue. + */ + ret = 0; + goto next; + } + goto out; + } } } else { ret = ref_get_fields(eb, ref_ptr, &name, &ref_index); + if (ret) + goto out; } - if (ret) - goto out; - ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), - btrfs_ino(BTRFS_I(inode)), ref_index, &name); + ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), + ref_index, &name); if (ret < 0) { goto out; } else if (ret == 0) { @@ -1448,8 +1480,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * overwrite any existing back reference, and we don't * want to create dangling pointers in the directory. */ - ret = __add_inode_ref(trans, root, path, log, - BTRFS_I(dir), BTRFS_I(inode), + ret = __add_inode_ref(trans, root, path, log, dir, inode, inode_objectid, parent_objectid, ref_index, &name); if (ret) { @@ -1459,22 +1490,22 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, } /* insert our name */ - ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - &name, 0, ref_index); + ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index); if (ret) goto out; - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, inode); if (ret) goto out; } /* Else, ret == 1, we already have a perfect match, we're done. */ +next: ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len; kfree(name.name); name.name = NULL; - if (log_ref_ver) { - iput(dir); + if (is_extref_item && dir) { + iput(&dir->vfs_inode); dir = NULL; } } @@ -1487,8 +1518,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * dir index entries exist for a name but there is no inode reference * item with the same name. */ - ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot, - key); + ret = unlink_old_inode_refs(trans, root, path, inode, eb, slot, key); if (ret) goto out; @@ -1497,8 +1527,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); kfree(name.name); - iput(dir); - iput(inode); + if (dir) + iput(&dir->vfs_inode); + if (inode) + iput(&inode->vfs_inode); return ret; } @@ -1612,25 +1644,25 @@ process_slot: * will free the inode. */ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, - struct inode *inode) + struct btrfs_inode *inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_path *path; int ret; u64 nlink = 0; - u64 ino = btrfs_ino(BTRFS_I(inode)); + const u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); if (!path) return -ENOMEM; - ret = count_inode_refs(BTRFS_I(inode), path); + ret = count_inode_refs(inode, path); if (ret < 0) goto out; nlink = ret; - ret = count_inode_extrefs(BTRFS_I(inode), path); + ret = count_inode_extrefs(inode, path); if (ret < 0) goto out; @@ -1638,19 +1670,18 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, ret = 0; - if (nlink != inode->i_nlink) { - set_nlink(inode, nlink); - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + if (nlink != inode->vfs_inode.i_nlink) { + set_nlink(&inode->vfs_inode, nlink); + ret = btrfs_update_inode(trans, inode); if (ret) goto out; } - if (S_ISDIR(inode->i_mode)) - BTRFS_I(inode)->index_cnt = (u64)-1; + if (S_ISDIR(inode->vfs_inode.i_mode)) + inode->index_cnt = (u64)-1; - if (inode->i_nlink == 0) { - if (S_ISDIR(inode->i_mode)) { - ret = replay_dir_deletes(trans, root, NULL, path, - ino, 1); + if (inode->vfs_inode.i_nlink == 0) { + if (S_ISDIR(inode->vfs_inode.i_mode)) { + ret = replay_dir_deletes(trans, root, NULL, path, ino, true); if (ret) goto out; } @@ -1670,12 +1701,13 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, { int ret; struct btrfs_key key; - struct inode *inode; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = (u64)-1; while (1) { + struct btrfs_inode *inode; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) break; @@ -1697,14 +1729,14 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, break; btrfs_release_path(path); - inode = read_one_inode(root, key.offset); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(key.offset, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); break; } ret = fixup_inode_link_count(trans, inode); - iput(inode); + iput(&inode->vfs_inode); if (ret) break; @@ -1732,12 +1764,14 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, { struct btrfs_key key; int ret = 0; - struct inode *inode; + struct btrfs_inode *inode; + struct inode *vfs_inode; - inode = read_one_inode(root, objectid); - if (!inode) - return -EIO; + inode = btrfs_iget_logging(objectid, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); + vfs_inode = &inode->vfs_inode; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = objectid; @@ -1746,15 +1780,15 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, btrfs_release_path(path); if (ret == 0) { - if (!inode->i_nlink) - set_nlink(inode, 1); + if (!vfs_inode->i_nlink) + set_nlink(vfs_inode, 1); else - inc_nlink(inode); - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + inc_nlink(vfs_inode); + ret = btrfs_update_inode(trans, inode); } else if (ret == -EEXIST) { ret = 0; } - iput(inode); + iput(vfs_inode); return ret; } @@ -1770,27 +1804,26 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, const struct fscrypt_str *name, struct btrfs_key *location) { - struct inode *inode; - struct inode *dir; + struct btrfs_inode *inode; + struct btrfs_inode *dir; int ret; - inode = read_one_inode(root, location->objectid); - if (!inode) - return -ENOENT; + inode = btrfs_iget_logging(location->objectid, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); - dir = read_one_inode(root, dirid); - if (!dir) { - iput(inode); - return -EIO; + dir = btrfs_iget_logging(dirid, root); + if (IS_ERR(dir)) { + iput(&inode->vfs_inode); + return PTR_ERR(dir); } - ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, - 1, index); + ret = btrfs_add_link(trans, dir, inode, name, 1, index); /* FIXME, put inode into FIXUP list */ - iput(inode); - iput(dir); + iput(&inode->vfs_inode); + iput(&dir->vfs_inode); return ret; } @@ -1852,16 +1885,16 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, bool index_dst_matches = false; struct btrfs_key log_key; struct btrfs_key search_key; - struct inode *dir; + struct btrfs_inode *dir; u8 log_flags; bool exists; int ret; bool update_size = true; bool name_added = false; - dir = read_one_inode(root, key->objectid); - if (!dir) - return -EIO; + dir = btrfs_iget_logging(key->objectid, root); + if (IS_ERR(dir)) + return PTR_ERR(dir); ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); if (ret) @@ -1882,9 +1915,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, ret = PTR_ERR(dir_dst_di); goto out; } else if (dir_dst_di) { - ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, - dir_dst_di, &log_key, - log_flags, exists); + ret = delete_conflicting_dir_entry(trans, dir, path, dir_dst_di, + &log_key, log_flags, exists); if (ret < 0) goto out; dir_dst_matches = (ret == 1); @@ -1899,9 +1931,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, ret = PTR_ERR(index_dst_di); goto out; } else if (index_dst_di) { - ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, - index_dst_di, &log_key, - log_flags, exists); + ret = delete_conflicting_dir_entry(trans, dir, path, index_dst_di, + &log_key, log_flags, exists); if (ret < 0) goto out; index_dst_matches = (ret == 1); @@ -1956,11 +1987,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, out: if (!ret && update_size) { - btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2); - ret = btrfs_update_inode(trans, BTRFS_I(dir)); + btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2); + ret = btrfs_update_inode(trans, dir); } kfree(name.name); - iput(dir); + iput(&dir->vfs_inode); if (!ret && name_added) ret = 1; return ret; @@ -2117,16 +2148,16 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, struct btrfs_path *log_path, - struct inode *dir, + struct btrfs_inode *dir, struct btrfs_key *dir_key) { - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; int ret; struct extent_buffer *eb; int slot; struct btrfs_dir_item *di; struct fscrypt_str name = { 0 }; - struct inode *inode = NULL; + struct btrfs_inode *inode = NULL; struct btrfs_key location; /* @@ -2163,9 +2194,10 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, btrfs_dir_item_key_to_cpu(eb, di, &location); btrfs_release_path(path); btrfs_release_path(log_path); - inode = read_one_inode(root, location.objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(location.objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; goto out; } @@ -2173,9 +2205,8 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, if (ret) goto out; - inc_nlink(inode); - ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), - &name); + inc_nlink(&inode->vfs_inode); + ret = unlink_inode_for_log_replay(trans, dir, inode, &name); /* * Unlike dir item keys, dir index keys can only have one name (entry) in * them, as there are no key collisions since each key has a unique offset @@ -2185,7 +2216,8 @@ out: btrfs_release_path(path); btrfs_release_path(log_path); kfree(name.name); - iput(inode); + if (inode) + iput(&inode->vfs_inode); return ret; } @@ -2301,7 +2333,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path, - u64 dirid, int del_all) + u64 dirid, bool del_all) { u64 range_start; u64 range_end; @@ -2309,7 +2341,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_key dir_key; struct btrfs_key found_key; struct btrfs_path *log_path; - struct inode *dir; + struct btrfs_inode *dir; dir_key.objectid = dirid; dir_key.type = BTRFS_DIR_INDEX_KEY; @@ -2317,14 +2349,17 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, if (!log_path) return -ENOMEM; - dir = read_one_inode(root, dirid); - /* it isn't an error if the inode isn't there, that can happen - * because we replay the deletes before we copy in the inode item - * from the log + dir = btrfs_iget_logging(dirid, root); + /* + * It isn't an error if the inode isn't there, that can happen because + * we replay the deletes before we copy in the inode item from the log. */ - if (!dir) { + if (IS_ERR(dir)) { btrfs_free_path(log_path); - return 0; + ret = PTR_ERR(dir); + if (ret == -ENOENT) + ret = 0; + return ret; } range_start = 0; @@ -2386,7 +2421,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); btrfs_free_path(log_path); - iput(dir); + iput(&dir->vfs_inode); return ret; } @@ -2460,8 +2495,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, break; mode = btrfs_inode_mode(eb, inode_item); if (S_ISDIR(mode)) { - ret = replay_dir_deletes(wc->trans, - root, log, path, key.objectid, 0); + ret = replay_dir_deletes(wc->trans, root, log, path, + key.objectid, false); if (ret) break; } @@ -2480,30 +2515,28 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, */ if (S_ISREG(mode)) { struct btrfs_drop_extents_args drop_args = { 0 }; - struct inode *inode; + struct btrfs_inode *inode; u64 from; - inode = read_one_inode(root, key.objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(key.objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); break; } - from = ALIGN(i_size_read(inode), + from = ALIGN(i_size_read(&inode->vfs_inode), root->fs_info->sectorsize); drop_args.start = from; drop_args.end = (u64)-1; drop_args.drop_cache = true; - ret = btrfs_drop_extents(wc->trans, root, - BTRFS_I(inode), + ret = btrfs_drop_extents(wc->trans, root, inode, &drop_args); if (!ret) { - inode_sub_bytes(inode, + inode_sub_bytes(&inode->vfs_inode, drop_args.bytes_found); /* Update the inode's nbytes. */ - ret = btrfs_update_inode(wc->trans, - BTRFS_I(inode)); + ret = btrfs_update_inode(wc->trans, inode); } - iput(inode); + iput(&inode->vfs_inode); if (ret) break; } @@ -2538,9 +2571,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, key.type == BTRFS_INODE_EXTREF_KEY) { ret = add_inode_ref(wc->trans, root, log, path, eb, i, &key); - if (ret && ret != -ENOENT) + if (ret) break; - ret = 0; } else if (key.type == BTRFS_EXTENT_DATA_KEY) { ret = replay_one_extent(wc->trans, root, path, eb, i, &key); @@ -2739,7 +2771,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, level = btrfs_header_level(log->node); orig_level = level; path->nodes[level] = log->node; - atomic_inc(&log->node->refs); + refcount_inc(&log->node->refs); path->slots[level] = 0; while (1) { @@ -2980,9 +3012,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } if (log_transid % 2 == 0) - mark = EXTENT_DIRTY; + mark = EXTENT_DIRTY_LOG1; else - mark = EXTENT_NEW; + mark = EXTENT_DIRTY_LOG2; /* we start IO on all the marked extents here, but we don't actually * wait for them until later. @@ -3113,7 +3145,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_write_marked_extents(fs_info, &log_root_tree->dirty_log_pages, - EXTENT_DIRTY | EXTENT_NEW); + EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2); blk_finish_plug(&plug); /* * As described above, -EAGAIN indicates a hole in the extents. We @@ -3133,7 +3165,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_wait_tree_log_extents(log, mark); if (!ret) ret = btrfs_wait_tree_log_extents(log_root_tree, - EXTENT_NEW | EXTENT_DIRTY); + EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2); if (ret) { btrfs_set_log_full_commit(trans); mutex_unlock(&log_root_tree->log_mutex); @@ -3259,9 +3291,9 @@ static void free_log_tree(struct btrfs_trans_handle *trans, */ btrfs_write_marked_extents(log->fs_info, &log->dirty_log_pages, - EXTENT_DIRTY | EXTENT_NEW); + EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2); btrfs_wait_tree_log_extents(log, - EXTENT_DIRTY | EXTENT_NEW); + EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2); if (trans) btrfs_abort_transaction(trans, ret); @@ -3270,8 +3302,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans, } } - extent_io_tree_release(&log->dirty_log_pages); - extent_io_tree_release(&log->log_csum_range); + btrfs_extent_io_tree_release(&log->dirty_log_pages); + btrfs_extent_io_tree_release(&log->log_csum_range); btrfs_put_root(log); } @@ -3451,7 +3483,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, * inode item because on log replay we update the field to reflect * all existing entries in the directory (see overwrite_item()). */ - return btrfs_delete_one_dir_name(trans, log, path, di); + return btrfs_del_item(trans, log, path); } /* @@ -3491,26 +3523,27 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, return; } - ret = join_running_log_trans(root); - if (ret) - return; - - mutex_lock(&dir->log_mutex); - path = btrfs_alloc_path(); if (!path) { - ret = -ENOMEM; - goto out_unlock; + btrfs_set_log_full_commit(trans); + return; } + ret = join_running_log_trans(root); + ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret); + if (WARN_ON(ret)) + goto out; + + mutex_lock(&dir->log_mutex); + ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir), name, index); - btrfs_free_path(path); -out_unlock: mutex_unlock(&dir->log_mutex); if (ret < 0) btrfs_set_log_full_commit(trans); btrfs_end_log_trans(root); +out: + btrfs_free_path(path); } /* see comments for btrfs_del_dir_entries_in_log */ @@ -3520,7 +3553,6 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 dirid) { struct btrfs_root *log; - u64 index; int ret; ret = inode_logged(trans, inode, NULL); @@ -3532,13 +3564,13 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, } ret = join_running_log_trans(root); - if (ret) + ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret); + if (WARN_ON(ret)) return; log = root->log_root; mutex_lock(&inode->log_mutex); - ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode), - dirid, &index); + ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode), dirid, NULL); mutex_unlock(&inode->log_mutex); if (ret < 0 && ret != -ENOENT) btrfs_set_log_full_commit(trans); @@ -3561,8 +3593,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, struct btrfs_dir_log_item *item; key.objectid = dirid; - key.offset = first_offset; key.type = BTRFS_DIR_LOG_INDEX_KEY; + key.offset = first_offset; ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); /* * -EEXIST is fine and can happen sporadically when we are logging a @@ -3588,7 +3620,6 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, last_offset = max(last_offset, curr_end); } btrfs_set_dir_log_end(path->nodes[0], item, last_offset); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); return 0; } @@ -3704,7 +3735,7 @@ static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx) * Add extra ref to scratch eb so that it is not freed when callers * release the path, so we can reuse it later if needed. */ - atomic_inc(&ctx->scratch_eb->refs); + refcount_inc(&ctx->scratch_eb->refs); return 0; } @@ -4192,44 +4223,37 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct inode *inode, int log_inode_only, u64 logged_isize) { - struct btrfs_map_token token; u64 flags; - btrfs_init_map_token(&token, leaf); - if (log_inode_only) { /* set the generation to zero so the recover code * can tell the difference between an logging * just to say 'this inode exists' and a logging * to say 'update this inode with these values' */ - btrfs_set_token_inode_generation(&token, item, 0); - btrfs_set_token_inode_size(&token, item, logged_isize); + btrfs_set_inode_generation(leaf, item, 0); + btrfs_set_inode_size(leaf, item, logged_isize); } else { - btrfs_set_token_inode_generation(&token, item, - BTRFS_I(inode)->generation); - btrfs_set_token_inode_size(&token, item, inode->i_size); + btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); + btrfs_set_inode_size(leaf, item, inode->i_size); } - btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); - btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); - btrfs_set_token_inode_mode(&token, item, inode->i_mode); - btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); + btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); + btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); + btrfs_set_inode_mode(leaf, item, inode->i_mode); + btrfs_set_inode_nlink(leaf, item, inode->i_nlink); + + btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(inode)); + btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(inode)); - btrfs_set_token_timespec_sec(&token, &item->atime, - inode_get_atime_sec(inode)); - btrfs_set_token_timespec_nsec(&token, &item->atime, - inode_get_atime_nsec(inode)); + btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(inode)); + btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(inode)); - btrfs_set_token_timespec_sec(&token, &item->mtime, - inode_get_mtime_sec(inode)); - btrfs_set_token_timespec_nsec(&token, &item->mtime, - inode_get_mtime_nsec(inode)); + btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(inode)); + btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(inode)); - btrfs_set_token_timespec_sec(&token, &item->ctime, - inode_get_ctime_sec(inode)); - btrfs_set_token_timespec_nsec(&token, &item->ctime, - inode_get_ctime_nsec(inode)); + btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec); + btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec); /* * We do not need to set the nbytes field, in fact during a fast fsync @@ -4240,13 +4264,13 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, * inode item in subvolume tree as needed (see overwrite_item()). */ - btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); - btrfs_set_token_inode_transid(&token, item, trans->transid); - btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); + btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(inode)); + btrfs_set_inode_transid(leaf, item, trans->transid); + btrfs_set_inode_rdev(leaf, item, inode->i_rdev); flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, BTRFS_I(inode)->ro_flags); - btrfs_set_token_inode_flags(&token, item, flags); - btrfs_set_token_inode_block_group(&token, item, 0); + btrfs_set_inode_flags(leaf, item, flags); + btrfs_set_inode_block_group(leaf, item, 0); } static int log_inode_item(struct btrfs_trans_handle *trans, @@ -4320,8 +4344,8 @@ static int log_csums(struct btrfs_trans_handle *trans, * file which happens to refer to the same extent as well. Such races * can leave checksum items in the log with overlapping ranges. */ - ret = lock_extent(&log_root->log_csum_range, sums->logical, lock_end, - &cached_state); + ret = btrfs_lock_extent(&log_root->log_csum_range, sums->logical, lock_end, + &cached_state); if (ret) return ret; /* @@ -4337,8 +4361,8 @@ static int log_csums(struct btrfs_trans_handle *trans, if (!ret) ret = btrfs_csum_file_blocks(trans, log_root, sums); - unlock_extent(&log_root->log_csum_range, sums->logical, lock_end, - &cached_state); + btrfs_unlock_extent(&log_root->log_csum_range, sums->logical, lock_end, + &cached_state); return ret; } @@ -4566,7 +4590,6 @@ copy_item: dst_index++; } - btrfs_mark_buffer_dirty(trans, dst_path->nodes[0]); btrfs_release_path(dst_path); out: kfree(ins_data); @@ -4669,7 +4692,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, return 0; /* If we're compressed we have to save the entire range of csums. */ - if (extent_map_is_compressed(em)) { + if (btrfs_extent_map_is_compressed(em)) { csum_offset = 0; csum_len = em->disk_num_bytes; } else { @@ -4678,7 +4701,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, } /* block start is already adjusted for the file extent offset. */ - block_start = extent_map_block_start(em); + block_start = btrfs_extent_map_block_start(em); csum_root = btrfs_csum_root(trans->fs_info, block_start); ret = btrfs_lookup_csums_list(csum_root, block_start + csum_offset, block_start + csum_offset + csum_len - 1, @@ -4688,9 +4711,9 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, ret = 0; while (!list_empty(&ordered_sums)) { - struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, - struct btrfs_ordered_sum, - list); + struct btrfs_ordered_sum *sums = list_first_entry(&ordered_sums, + struct btrfs_ordered_sum, + list); if (!ret) ret = log_csums(trans, inode, log_root, sums); list_del(&sums->list); @@ -4713,7 +4736,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_key key; enum btrfs_compression_type compress_type; u64 extent_offset = em->offset; - u64 block_start = extent_map_block_start(em); + u64 block_start = btrfs_extent_map_block_start(em); u64 block_len; int ret; @@ -4724,7 +4747,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG); block_len = em->disk_num_bytes; - compress_type = extent_map_compression(em); + compress_type = btrfs_extent_map_compression(em); if (compress_type != BTRFS_COMPRESS_NONE) { btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start); btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); @@ -4776,7 +4799,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, &fi, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(fi)); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); @@ -4969,7 +4991,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, list_sort(NULL, &extents, extent_cmp); process: while (!list_empty(&extents)) { - em = list_entry(extents.next, struct extent_map, list); + em = list_first_entry(&extents, struct extent_map, list); list_del_init(&em->list); @@ -4978,8 +5000,8 @@ process: * private list. */ if (ret) { - clear_em_logging(inode, em); - free_extent_map(em); + btrfs_clear_em_logging(inode, em); + btrfs_free_extent_map(em); continue; } @@ -4987,8 +5009,8 @@ process: ret = log_one_extent(trans, inode, em, path, ctx); write_lock(&tree->lock); - clear_em_logging(inode, em); - free_extent_map(em); + btrfs_clear_em_logging(inode, em); + btrfs_free_extent_map(em); } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); @@ -5485,7 +5507,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, ihold(&curr_inode->vfs_inode); while (true) { - struct inode *vfs_inode; struct btrfs_key key; struct btrfs_key found_key; u64 next_index; @@ -5501,7 +5522,7 @@ again: struct extent_buffer *leaf = path->nodes[0]; struct btrfs_dir_item *di; struct btrfs_key di_key; - struct inode *di_inode; + struct btrfs_inode *di_inode; int log_mode = LOG_INODE_EXISTS; int type; @@ -5528,17 +5549,16 @@ again: goto out; } - if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + if (!need_log_inode(trans, di_inode)) { + btrfs_add_delayed_iput(di_inode); break; } ctx->log_new_dentries = false; if (type == BTRFS_FT_DIR) log_mode = LOG_INODE_ALL; - ret = btrfs_log_inode(trans, BTRFS_I(di_inode), - log_mode, ctx); - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + ret = btrfs_log_inode(trans, di_inode, log_mode, ctx); + btrfs_add_delayed_iput(di_inode); if (ret) goto out; if (ctx->log_new_dentries) { @@ -5580,14 +5600,13 @@ again: kfree(dir_elem); btrfs_add_delayed_iput(curr_inode); - curr_inode = NULL; - vfs_inode = btrfs_iget_logging(ino, root); - if (IS_ERR(vfs_inode)) { - ret = PTR_ERR(vfs_inode); + curr_inode = btrfs_iget_logging(ino, root); + if (IS_ERR(curr_inode)) { + ret = PTR_ERR(curr_inode); + curr_inode = NULL; break; } - curr_inode = BTRFS_I(vfs_inode); } out: btrfs_free_path(path); @@ -5665,7 +5684,7 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { struct btrfs_ino_list *ino_elem; - struct inode *inode; + struct btrfs_inode *inode; /* * It's rare to have a lot of conflicting inodes, in practice it is not @@ -5756,12 +5775,12 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, * inode in LOG_INODE_EXISTS mode and rename operations update the log, * so that the log ends up with the new name and without the old name. */ - if (!need_log_inode(trans, BTRFS_I(inode))) { - btrfs_add_delayed_iput(BTRFS_I(inode)); + if (!need_log_inode(trans, inode)) { + btrfs_add_delayed_iput(inode); return 0; } - btrfs_add_delayed_iput(BTRFS_I(inode)); + btrfs_add_delayed_iput(inode); ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); if (!ino_elem) @@ -5797,7 +5816,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, */ while (!list_empty(&ctx->conflict_inodes)) { struct btrfs_ino_list *curr; - struct inode *inode; + struct btrfs_inode *inode; u64 ino; u64 parent; @@ -5833,9 +5852,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * dir index key range logged for the directory. So we * must make sure the deletion is recorded. */ - ret = btrfs_log_inode(trans, BTRFS_I(inode), - LOG_INODE_ALL, ctx); - btrfs_add_delayed_iput(BTRFS_I(inode)); + ret = btrfs_log_inode(trans, inode, LOG_INODE_ALL, ctx); + btrfs_add_delayed_iput(inode); if (ret) break; continue; @@ -5851,8 +5869,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * it again because if some other task logged the inode after * that, we can avoid doing it again. */ - if (!need_log_inode(trans, BTRFS_I(inode))) { - btrfs_add_delayed_iput(BTRFS_I(inode)); + if (!need_log_inode(trans, inode)) { + btrfs_add_delayed_iput(inode); continue; } @@ -5863,8 +5881,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * well because during a rename we pin the log and update the * log with the new name before we unpin it. */ - ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); - btrfs_add_delayed_iput(BTRFS_I(inode)); + ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx); + btrfs_add_delayed_iput(inode); if (ret) break; } @@ -6355,7 +6373,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, list_for_each_entry(item, delayed_ins_list, log_list) { struct btrfs_dir_item *dir_item; - struct inode *di_inode; + struct btrfs_inode *di_inode; struct btrfs_key key; int log_mode = LOG_INODE_EXISTS; @@ -6371,8 +6389,8 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, break; } - if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + if (!need_log_inode(trans, di_inode)) { + btrfs_add_delayed_iput(di_inode); continue; } @@ -6380,12 +6398,12 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, log_mode = LOG_INODE_ALL; ctx->log_new_dentries = false; - ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); + ret = btrfs_log_inode(trans, di_inode, log_mode, ctx); if (!ret && ctx->log_new_dentries) - ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx); + ret = log_new_dir_dentries(trans, di_inode, ctx); - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + btrfs_add_delayed_iput(di_inode); if (ret) break; @@ -6609,6 +6627,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, btrfs_log_get_delayed_items(inode, &delayed_ins_list, &delayed_del_list); + /* + * If we are fsyncing a file with 0 hard links, then commit the delayed + * inode because the last inode ref (or extref) item may still be in the + * subvolume tree and if we log it the file will still exist after a log + * replay. So commit the delayed inode to delete that last ref and we + * skip logging it. + */ + if (inode->vfs_inode.i_nlink == 0) { + ret = btrfs_commit_inode_delayed_inode(inode); + if (ret) + goto out_unlock; + } + ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key, path, dst_path, logged_isize, inode_only, ctx, @@ -6793,7 +6824,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr_offset(leaf, slot); while (cur_offset < item_size) { struct btrfs_key inode_key; - struct inode *dir_inode; + struct btrfs_inode *dir_inode; inode_key.type = BTRFS_INODE_ITEM_KEY; inode_key.offset = 0; @@ -6842,18 +6873,16 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, goto out; } - if (!need_log_inode(trans, BTRFS_I(dir_inode))) { - btrfs_add_delayed_iput(BTRFS_I(dir_inode)); + if (!need_log_inode(trans, dir_inode)) { + btrfs_add_delayed_iput(dir_inode); continue; } ctx->log_new_dentries = false; - ret = btrfs_log_inode(trans, BTRFS_I(dir_inode), - LOG_INODE_ALL, ctx); + ret = btrfs_log_inode(trans, dir_inode, LOG_INODE_ALL, ctx); if (!ret && ctx->log_new_dentries) - ret = log_new_dir_dentries(trans, - BTRFS_I(dir_inode), ctx); - btrfs_add_delayed_iput(BTRFS_I(dir_inode)); + ret = log_new_dir_dentries(trans, dir_inode, ctx); + btrfs_add_delayed_iput(dir_inode); if (ret) goto out; } @@ -6878,7 +6907,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; int slot; struct btrfs_key search_key; - struct inode *inode; + struct btrfs_inode *inode; u64 ino; int ret = 0; @@ -6893,11 +6922,10 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) return PTR_ERR(inode); - if (BTRFS_I(inode)->generation >= trans->transid && - need_log_inode(trans, BTRFS_I(inode))) - ret = btrfs_log_inode(trans, BTRFS_I(inode), - LOG_INODE_EXISTS, ctx); - btrfs_add_delayed_iput(BTRFS_I(inode)); + if (inode->generation >= trans->transid && + need_log_inode(trans, inode)) + ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx); + btrfs_add_delayed_iput(inode); if (ret) return ret; @@ -7065,42 +7093,29 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; - bool log_dentries = false; + bool log_dentries; - if (btrfs_test_opt(fs_info, NOTREELOG)) { - ret = BTRFS_LOG_FORCE_COMMIT; - goto end_no_trans; - } + if (btrfs_test_opt(fs_info, NOTREELOG)) + return BTRFS_LOG_FORCE_COMMIT; - if (btrfs_root_refs(&root->root_item) == 0) { - ret = BTRFS_LOG_FORCE_COMMIT; - goto end_no_trans; - } + if (btrfs_root_refs(&root->root_item) == 0) + return BTRFS_LOG_FORCE_COMMIT; /* * If we're logging an inode from a subvolume created in the current * transaction we must force a commit since the root is not persisted. */ - if (btrfs_root_generation(&root->root_item) == trans->transid) { - ret = BTRFS_LOG_FORCE_COMMIT; - goto end_no_trans; - } + if (btrfs_root_generation(&root->root_item) == trans->transid) + return BTRFS_LOG_FORCE_COMMIT; - /* - * Skip already logged inodes or inodes corresponding to tmpfiles - * (since logging them is pointless, a link count of 0 means they - * will never be accessible). - */ - if ((btrfs_inode_in_log(inode, trans->transid) && - list_empty(&ctx->ordered_extents)) || - inode->vfs_inode.i_nlink == 0) { - ret = BTRFS_NO_LOG_SYNC; - goto end_no_trans; - } + /* Skip already logged inodes and without new extents. */ + if (btrfs_inode_in_log(inode, trans->transid) && + list_empty(&ctx->ordered_extents)) + return BTRFS_NO_LOG_SYNC; ret = start_log_trans(trans, root, ctx); if (ret) - goto end_no_trans; + return ret; ret = btrfs_log_inode(trans, inode, inode_only, ctx); if (ret) @@ -7119,8 +7134,11 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; } - if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries) - log_dentries = true; + /* + * Track if we need to log dentries because ctx->log_new_dentries can + * be modified in the call chains below. + */ + log_dentries = ctx->log_new_dentries; /* * On unlink we must make sure all our current and old parent directory @@ -7175,8 +7193,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (log_dentries) ret = log_new_dir_dentries(trans, inode, ctx); - else - ret = 0; end_trans: if (ret < 0) { btrfs_set_log_full_commit(trans); @@ -7186,7 +7202,7 @@ end_trans: if (ret) btrfs_remove_log_ctx(root, ctx); btrfs_end_log_trans(root); -end_no_trans: + return ret; } @@ -7220,8 +7236,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) struct btrfs_path *path; struct btrfs_trans_handle *trans; struct btrfs_key key; - struct btrfs_key found_key; - struct btrfs_root *log; struct btrfs_fs_info *fs_info = log_root_tree->fs_info; struct walk_control wc = { .process_func = process_one_buffer, @@ -7251,10 +7265,13 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) again: key.objectid = BTRFS_TREE_LOG_OBJECTID; - key.offset = (u64)-1; key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; while (1) { + struct btrfs_root *log; + struct btrfs_key found_key; + ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); if (ret < 0) { @@ -7283,6 +7300,12 @@ again: true); if (IS_ERR(wc.replay_dest)) { ret = PTR_ERR(wc.replay_dest); + wc.replay_dest = NULL; + if (ret != -ENOENT) { + btrfs_put_root(log); + btrfs_abort_transaction(trans, ret); + goto error; + } /* * We didn't find the subvol, likely because it was @@ -7295,36 +7318,36 @@ again: * block from being modified, and we'll just bail for * each subsequent pass. */ - if (ret == -ENOENT) - ret = btrfs_pin_extent_for_log_replay(trans, log->node); - btrfs_put_root(log); - - if (!ret) - goto next; - btrfs_abort_transaction(trans, ret); - goto error; + ret = btrfs_pin_extent_for_log_replay(trans, log->node); + if (ret) { + btrfs_put_root(log); + btrfs_abort_transaction(trans, ret); + goto error; + } + goto next; } wc.replay_dest->log_root = log; ret = btrfs_record_root_in_trans(trans, wc.replay_dest); - if (ret) - /* The loop needs to continue due to the root refs */ + if (ret) { btrfs_abort_transaction(trans, ret); - else - ret = walk_log_tree(trans, log, &wc); + goto next; + } - if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { - ret = fixup_inode_link_counts(trans, wc.replay_dest, - path); - if (ret) - btrfs_abort_transaction(trans, ret); + ret = walk_log_tree(trans, log, &wc); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto next; } - if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { + if (wc.stage == LOG_WALK_REPLAY_ALL) { struct btrfs_root *root = wc.replay_dest; - btrfs_release_path(path); - + ret = fixup_inode_link_counts(trans, wc.replay_dest, path); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto next; + } /* * We have just replayed everything, and the highest * objectid of fs roots probably has changed in case @@ -7334,17 +7357,20 @@ again: * could only happen during mount. */ ret = btrfs_init_root_free_objectid(root); - if (ret) + if (ret) { btrfs_abort_transaction(trans, ret); + goto next; + } + } +next: + if (wc.replay_dest) { + wc.replay_dest->log_root = NULL; + btrfs_put_root(wc.replay_dest); } - - wc.replay_dest->log_root = NULL; - btrfs_put_root(wc.replay_dest); btrfs_put_root(log); if (ret) goto error; -next: if (found_key.offset == 0) break; key.offset = found_key.offset - 1; @@ -7475,6 +7501,8 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, * full log sync. * Also we don't need to worry with renames, since btrfs_rename() marks the log * for full commit when renaming a subvolume. + * + * Must be called before creating the subvolume entry in its parent directory. */ void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans, struct btrfs_inode *dir) @@ -7511,6 +7539,9 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, bool log_pinned = false; int ret; + btrfs_init_log_ctx(&ctx, inode); + ctx.logging_new_name = true; + /* * this will force the logging code to walk the dentry chain * up for the file @@ -7542,6 +7573,13 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, ret = 0; /* + * Now that we know we need to update the log, allocate the scratch eb + * for the context before joining a log transaction below, as this can + * take time and therefore we could delay log commits from other tasks. + */ + btrfs_init_log_ctx_scratch_eb(&ctx); + + /* * If we are doing a rename (old_dir is not NULL) from a directory that * was previously logged, make sure that on log replay we get the old * dir entry deleted. This is needed because we will also log the new @@ -7559,6 +7597,14 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, &old_dentry->d_name, 0, &fname); if (ret) goto out; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + fscrypt_free_filename(&fname); + goto out; + } + /* * We have two inodes to update in the log, the old directory and * the inode that got renamed, so we must pin the log to prevent @@ -7572,19 +7618,13 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * mark the log for a full commit. */ if (WARN_ON_ONCE(ret < 0)) { + btrfs_free_path(path); fscrypt_free_filename(&fname); goto out; } log_pinned = true; - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - fscrypt_free_filename(&fname); - goto out; - } - /* * Other concurrent task might be logging the old directory, * as it can be triggered when logging other inode that had or @@ -7616,9 +7656,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, goto out; } - btrfs_init_log_ctx(&ctx, inode); - ctx.logging_new_name = true; - btrfs_init_log_ctx_scratch_eb(&ctx); /* * We don't care about the return value. If we fail to log the new name * then we know the next attempt to sync the log will fallback to a full @@ -7627,7 +7664,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * inconsistent state after a rename operation. */ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); - free_extent_buffer(ctx.scratch_eb); ASSERT(list_empty(&ctx.conflict_inodes)); out: /* @@ -7640,5 +7676,6 @@ out: btrfs_set_log_full_commit(trans); if (log_pinned) btrfs_end_log_trans(root); + free_extent_buffer(ctx.scratch_eb); } diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 1ac2678fc4ca..9e8cb3b7c064 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -27,18 +27,29 @@ struct tree_mod_elem { /* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */ u64 generation; - /* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */ - struct btrfs_disk_key key; - u64 blockptr; - - /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */ - struct { - int dst_slot; - int nr_items; - } move; - - /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */ - struct tree_mod_root old_root; + union { + /* + * This is used for the following op types: + * + * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING + * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING + * BTRFS_MOD_LOG_KEY_REMOVE + * BTRFS_MOD_LOG_KEY_REPLACE + */ + struct { + struct btrfs_disk_key key; + u64 blockptr; + } slot_change; + + /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */ + struct { + int dst_slot; + int nr_items; + } move; + + /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */ + struct tree_mod_root old_root; + }; }; /* @@ -164,6 +175,30 @@ static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info, return 0; } +static inline bool skip_eb_logging(const struct extent_buffer *eb) +{ + const u64 owner = btrfs_header_owner(eb); + + if (btrfs_header_level(eb) == 0) + return true; + + /* + * Tree mod logging exists so that there's a consistent view of the + * extents and backrefs of inodes even if while a task is iterating over + * them other tasks are modifying subvolume trees and the extent tree + * (including running delayed refs). So we only need to log extent + * buffers from the extent tree and subvolume trees. + */ + + if (owner == BTRFS_EXTENT_TREE_OBJECTID) + return false; + + if (btrfs_is_fstree(owner)) + return false; + + return true; +} + /* * Determines if logging can be omitted. Returns true if it can. Otherwise, it * returns false with the tree_mod_log_lock acquired. The caller must hold @@ -174,7 +209,7 @@ static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, const struct extent { if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) return true; - if (eb && btrfs_header_level(eb) == 0) + if (eb && skip_eb_logging(eb)) return true; write_lock(&fs_info->tree_mod_log_lock); @@ -192,7 +227,7 @@ static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, { if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) return false; - if (eb && btrfs_header_level(eb) == 0) + if (eb && skip_eb_logging(eb)) return false; return true; @@ -204,15 +239,17 @@ static struct tree_mod_elem *alloc_tree_mod_elem(const struct extent_buffer *eb, { struct tree_mod_elem *tm; + /* Can't be one of these types, due to union in struct tree_mod_elem. */ + ASSERT(op != BTRFS_MOD_LOG_MOVE_KEYS); + ASSERT(op != BTRFS_MOD_LOG_ROOT_REPLACE); + tm = kzalloc(sizeof(*tm), GFP_NOFS); if (!tm) return NULL; tm->logical = eb->start; - if (op != BTRFS_MOD_LOG_KEY_ADD) { - btrfs_node_key(eb, &tm->key, slot); - tm->blockptr = btrfs_node_blockptr(eb, slot); - } + btrfs_node_key(eb, &tm->slot_change.key, slot); + tm->slot_change.blockptr = btrfs_node_blockptr(eb, slot); tm->op = op; tm->slot = slot; tm->generation = btrfs_node_ptr_generation(eb, slot); @@ -830,8 +867,8 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, fallthrough; case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING: case BTRFS_MOD_LOG_KEY_REMOVE: - btrfs_set_node_key(eb, &tm->key, tm->slot); - btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); + btrfs_set_node_key(eb, &tm->slot_change.key, tm->slot); + btrfs_set_node_blockptr(eb, tm->slot, tm->slot_change.blockptr); btrfs_set_node_ptr_generation(eb, tm->slot, tm->generation); n++; @@ -840,8 +877,8 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, break; case BTRFS_MOD_LOG_KEY_REPLACE: BUG_ON(tm->slot >= n); - btrfs_set_node_key(eb, &tm->key, tm->slot); - btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); + btrfs_set_node_key(eb, &tm->slot_change.key, tm->slot); + btrfs_set_node_blockptr(eb, tm->slot, tm->slot_change.blockptr); btrfs_set_node_ptr_generation(eb, tm->slot, tm->generation); break; diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index fc59b57257d6..7e16a253fb35 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -129,21 +129,25 @@ void ulist_free(struct ulist *ulist) kfree(ulist); } +static int ulist_node_val_key_cmp(const void *key, const struct rb_node *node) +{ + const u64 *val = key; + const struct ulist_node *unode = rb_entry(node, struct ulist_node, rb_node); + + if (unode->val < *val) + return 1; + else if (unode->val > *val) + return -1; + + return 0; +} + static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val) { - struct rb_node *n = ulist->root.rb_node; - struct ulist_node *u = NULL; - - while (n) { - u = rb_entry(n, struct ulist_node, rb_node); - if (u->val < val) - n = n->rb_right; - else if (u->val > val) - n = n->rb_left; - else - return u; - } - return NULL; + struct rb_node *node; + + node = rb_find(&val, &ulist->root, ulist_node_val_key_cmp); + return rb_entry_safe(node, struct ulist_node, rb_node); } static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node) @@ -155,25 +159,20 @@ static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node) ulist->nnodes--; } +static int ulist_node_val_cmp(struct rb_node *new, const struct rb_node *existing) +{ + const struct ulist_node *unode = rb_entry(new, struct ulist_node, rb_node); + + return ulist_node_val_key_cmp(&unode->val, existing); +} + static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins) { - struct rb_node **p = &ulist->root.rb_node; - struct rb_node *parent = NULL; - struct ulist_node *cur = NULL; - - while (*p) { - parent = *p; - cur = rb_entry(parent, struct ulist_node, rb_node); - - if (cur->val < ins->val) - p = &(*p)->rb_right; - else if (cur->val > ins->val) - p = &(*p)->rb_left; - else - return -EEXIST; - } - rb_link_node(&ins->rb_node, parent, p); - rb_insert_color(&ins->rb_node, &ulist->root); + struct rb_node *node; + + node = rb_find_add(&ins->rb_node, &ulist->root, ulist_node_val_cmp); + if (node) + return -EEXIST; return 0; } diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index aca2861f2187..17b5e81123a1 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -140,8 +140,6 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ ret = 0; subid_le = cpu_to_le64(subid_cpu); write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le)); - btrfs_mark_buffer_dirty(trans, eb); - out: btrfs_free_path(path); return ret; diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index e97ad824ae16..b7a96a005487 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -485,7 +485,7 @@ static int rollback_verity(struct btrfs_inode *inode) goto out; } inode->ro_flags &= ~BTRFS_INODE_RO_VERITY; - btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + btrfs_sync_inode_flags_to_i_flags(inode); ret = btrfs_update_inode(trans, inode); if (ret) { btrfs_abort_transaction(trans, ret); @@ -552,7 +552,7 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc, goto out; } inode->ro_flags |= BTRFS_INODE_RO_VERITY; - btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + btrfs_sync_inode_flags_to_i_flags(inode); ret = btrfs_update_inode(trans, inode); if (ret) goto end_trans; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1cccaf9c2b0d..fa7a929a0461 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -13,12 +13,11 @@ #include <linux/list_sort.h> #include <linux/namei.h> #include "misc.h" -#include "ctree.h" #include "disk-io.h" +#include "extent-tree.h" #include "transaction.h" #include "volumes.h" #include "raid56.h" -#include "rcu-string.h" #include "dev-replace.h" #include "sysfs.h" #include "tree-checker.h" @@ -48,6 +47,7 @@ struct btrfs_io_geometry { u64 raid56_full_stripe_start; int max_errors; enum btrfs_map_op op; + bool use_rst; }; const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { @@ -213,10 +213,8 @@ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) u64 flags = bg_flags; u32 size_bp = size_buf; - if (!flags) { - strcpy(bp, "NONE"); + if (!flags) return; - } #define DESCRIBE_FLAG(flag, desc) \ do { \ @@ -402,8 +400,12 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) static void btrfs_free_device(struct btrfs_device *device) { WARN_ON(!list_empty(&device->post_commit_list)); - rcu_string_free(device->name); - extent_io_tree_release(&device->alloc_state); + /* + * No need to call kfree_rcu() nor do RCU lock/unlock, nothing is + * reading the device name. + */ + kfree(rcu_dereference_raw(device->name)); + btrfs_extent_io_tree_release(&device->alloc_state); btrfs_destroy_dev_zone_info(device); kfree(device); } @@ -413,9 +415,10 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) struct btrfs_device *device; WARN_ON(fs_devices->opened); + WARN_ON(fs_devices->holding); while (!list_empty(&fs_devices->devices)) { - device = list_entry(fs_devices->devices.next, - struct btrfs_device, dev_list); + device = list_first_entry(&fs_devices->devices, + struct btrfs_device, dev_list); list_del(&device->dev_list); btrfs_free_device(device); } @@ -427,8 +430,8 @@ void __exit btrfs_cleanup_fs_uuids(void) struct btrfs_fs_devices *fs_devices; while (!list_empty(&fs_uuids)) { - fs_devices = list_entry(fs_uuids.next, - struct btrfs_fs_devices, fs_list); + fs_devices = list_first_entry(&fs_uuids, struct btrfs_fs_devices, + fs_list); list_del(&fs_devices->fs_list); free_fs_devices(fs_devices); } @@ -472,7 +475,7 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, struct block_device *bdev; int ret; - *bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL); + *bdev_file = bdev_file_open_by_path(device_path, flags, holder, &fs_holder_ops); if (IS_ERR(*bdev_file)) { ret = PTR_ERR(*bdev_file); @@ -487,15 +490,15 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, if (holder) { ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE); if (ret) { - fput(*bdev_file); + bdev_fput(*bdev_file); goto error; } } invalidate_bdev(bdev); - *disk_super = btrfs_read_dev_super(bdev); + *disk_super = btrfs_read_disk_super(bdev, 0, false); if (IS_ERR(*disk_super)) { ret = PTR_ERR(*disk_super); - fput(*bdev_file); + bdev_fput(*bdev_file); goto error; } @@ -540,7 +543,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device continue; if (devt && devt != device->devt) continue; - if (fs_devices->opened) { + if (fs_devices->opened || fs_devices->holding) { if (devt) ret = -EBUSY; break; @@ -656,7 +659,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, if (!device->name) return -EINVAL; - ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, + ret = btrfs_get_bdev_and_sb(rcu_dereference_raw(device->name), flags, holder, 1, &bdev_file, &disk_super); if (ret) return ret; @@ -673,8 +676,8 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { if (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID) { - pr_err( - "BTRFS: Invalid seeding and uuid-changed device detected\n"); + btrfs_err(NULL, + "invalid seeding and uuid-changed device detected"); goto error_free_page; } @@ -700,7 +703,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, if (device->devt != device->bdev->bd_dev) { btrfs_warn(NULL, "device %s maj:min changed from %d:%d to %d:%d", - device->name->str, MAJOR(device->devt), + rcu_dereference_raw(device->name), MAJOR(device->devt), MINOR(device->devt), MAJOR(device->bdev->bd_dev), MINOR(device->bdev->bd_dev)); @@ -719,7 +722,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, error_free_page: btrfs_release_disk_super(disk_super); - fput(bdev_file); + bdev_fput(bdev_file); return -EINVAL; } @@ -732,78 +735,6 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb) return has_metadata_uuid ? sb->metadata_uuid : sb->fsid; } -/* - * We can have very weird soft links passed in. - * One example is "/proc/self/fd/<fd>", which can be a soft link to - * a block device. - * - * But it's never a good idea to use those weird names. - * Here we check if the path (not following symlinks) is a good one inside - * "/dev/". - */ -static bool is_good_dev_path(const char *dev_path) -{ - struct path path = { .mnt = NULL, .dentry = NULL }; - char *path_buf = NULL; - char *resolved_path; - bool is_good = false; - int ret; - - if (!dev_path) - goto out; - - path_buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!path_buf) - goto out; - - /* - * Do not follow soft link, just check if the original path is inside - * "/dev/". - */ - ret = kern_path(dev_path, 0, &path); - if (ret) - goto out; - resolved_path = d_path(&path, path_buf, PATH_MAX); - if (IS_ERR(resolved_path)) - goto out; - if (strncmp(resolved_path, "/dev/", strlen("/dev/"))) - goto out; - is_good = true; -out: - kfree(path_buf); - path_put(&path); - return is_good; -} - -static int get_canonical_dev_path(const char *dev_path, char *canonical) -{ - struct path path = { .mnt = NULL, .dentry = NULL }; - char *path_buf = NULL; - char *resolved_path; - int ret; - - if (!dev_path) { - ret = -EINVAL; - goto out; - } - - path_buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!path_buf) { - ret = -ENOMEM; - goto out; - } - - ret = kern_path(dev_path, LOOKUP_FOLLOW, &path); - if (ret) - goto out; - resolved_path = d_path(&path, path_buf, PATH_MAX); - ret = strscpy(canonical, resolved_path, PATH_MAX); -out: - kfree(path_buf); - path_put(&path); - return ret; -} - static bool is_same_device(struct btrfs_device *device, const char *new_path) { struct path old = { .mnt = NULL, .dentry = NULL }; @@ -820,7 +751,7 @@ static bool is_same_device(struct btrfs_device *device, const char *new_path) goto out; rcu_read_lock(); - ret = strscpy(old_path, rcu_str_deref(device->name), PATH_MAX); + ret = strscpy(old_path, rcu_dereference(device->name), PATH_MAX); rcu_read_unlock(); if (ret < 0) goto out; @@ -853,11 +784,11 @@ static noinline struct btrfs_device *device_list_add(const char *path, { struct btrfs_device *device; struct btrfs_fs_devices *fs_devices = NULL; - struct rcu_string *name; + const char *name; u64 found_transid = btrfs_super_generation(disk_super); u64 devid = btrfs_stack_device_id(&disk_super->dev_item); dev_t path_devt; - int error; + int ret; bool same_fsid_diff_dev = false; bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID); @@ -869,11 +800,11 @@ static noinline struct btrfs_device *device_list_add(const char *path, return ERR_PTR(-EAGAIN); } - error = lookup_bdev(path, &path_devt); - if (error) { + ret = lookup_bdev(path, &path_devt); + if (ret) { btrfs_err(NULL, "failed to lookup block device for path %s: %d", - path, error); - return ERR_PTR(error); + path, ret); + return ERR_PTR(ret); } fs_devices = find_fsid_by_device(disk_super, path_devt, &same_fsid_diff_dev); @@ -890,7 +821,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (same_fsid_diff_dev) { generate_random_uuid(fs_devices->fsid); fs_devices->temp_fsid = true; - pr_info("BTRFS: device %s (%d:%d) using temp-fsid %pU\n", + btrfs_info(NULL, "device %s (%d:%d) using temp-fsid %pU", path, MAJOR(path_devt), MINOR(path_devt), fs_devices->fsid); } @@ -961,6 +892,8 @@ static noinline struct btrfs_device *device_list_add(const char *path, current->comm, task_pid_nr(current)); } else if (!device->name || !is_same_device(device, path)) { + const char *old_name; + /* * When FS is already mounted. * 1. If you are here and if the device->name is NULL that @@ -1014,27 +947,31 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (device->bdev) { if (device->devt != path_devt) { mutex_unlock(&fs_devices->device_list_mutex); - btrfs_warn_in_rcu(NULL, + btrfs_warn(NULL, "duplicate device %s devid %llu generation %llu scanned by %s (%d)", path, devid, found_transid, current->comm, task_pid_nr(current)); return ERR_PTR(-EEXIST); } - btrfs_info_in_rcu(NULL, + btrfs_info(NULL, "devid %llu device path %s changed to %s scanned by %s (%d)", devid, btrfs_dev_name(device), path, current->comm, task_pid_nr(current)); } - name = rcu_string_strdup(path, GFP_NOFS); + name = kstrdup(path, GFP_NOFS); if (!name) { mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-ENOMEM); } - rcu_string_free(device->name); + rcu_read_lock(); + old_name = rcu_dereference(device->name); + rcu_read_unlock(); rcu_assign_pointer(device->name, name); + kfree_rcu_mightsleep(old_name); + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { fs_devices->missing_devices--; clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); @@ -1083,7 +1020,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) * uuid mutex so nothing we touch in here is going to disappear. */ if (orig_dev->name) - dev_path = orig_dev->name->str; + dev_path = rcu_dereference_raw(orig_dev->name); device = btrfs_alloc_device(NULL, &orig_dev->devid, orig_dev->uuid, dev_path); @@ -1141,7 +1078,7 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, continue; if (device->bdev_file) { - fput(device->bdev_file); + bdev_fput(device->bdev_file); device->bdev = NULL; device->bdev_file = NULL; fs_devices->open_devices--; @@ -1188,7 +1125,7 @@ static void btrfs_close_bdev(struct btrfs_device *device) invalidate_bdev(device->bdev); } - fput(device->bdev_file); + bdev_fput(device->bdev_file); } static void btrfs_close_one_device(struct btrfs_device *device) @@ -1220,7 +1157,7 @@ static void btrfs_close_one_device(struct btrfs_device *device) device->fs_info = NULL; atomic_set(&device->dev_stats_ccnt, 0); - extent_io_tree_release(&device->alloc_state); + btrfs_extent_io_tree_release(&device->alloc_state); /* * Reset the flush error record. We might have a transient flush error @@ -1268,7 +1205,7 @@ void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) mutex_lock(&uuid_mutex); close_fs_devices(fs_devices); - if (!fs_devices->opened) { + if (!fs_devices->opened && !fs_devices->holding) { list_splice_init(&fs_devices->seed_list, &list); /* @@ -1298,6 +1235,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; struct btrfs_device *tmp_device; + s64 __maybe_unused value = 0; int ret = 0; list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, @@ -1327,7 +1265,23 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, fs_devices->latest_dev = latest_dev; fs_devices->total_rw_bytes = 0; fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; + fs_devices->read_devid = latest_dev->devid; + fs_devices->read_policy = btrfs_read_policy_to_enum(btrfs_get_mod_read_policy(), + &value); + if (fs_devices->read_policy == BTRFS_READ_POLICY_RR) + fs_devices->collect_fs_stats = true; + + if (value) { + if (fs_devices->read_policy == BTRFS_READ_POLICY_RR) + fs_devices->rr_min_contig_read = value; + if (fs_devices->read_policy == BTRFS_READ_POLICY_DEVID) + fs_devices->read_devid = value; + } +#else fs_devices->read_policy = BTRFS_READ_POLICY_PID; +#endif return 0; } @@ -1379,48 +1333,58 @@ void btrfs_release_disk_super(struct btrfs_super_block *super) put_page(page); } -static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, - u64 bytenr, u64 bytenr_orig) +struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, + int copy_num, bool drop_cache) { - struct btrfs_super_block *disk_super; + struct btrfs_super_block *super; struct page *page; - void *p; - pgoff_t index; + u64 bytenr, bytenr_orig; + struct address_space *mapping = bdev->bd_mapping; + int ret; - /* make sure our super fits in the device */ - if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev)) - return ERR_PTR(-EINVAL); + bytenr_orig = btrfs_sb_offset(copy_num); + ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr); + if (ret < 0) { + if (ret == -ENOENT) + ret = -EINVAL; + return ERR_PTR(ret); + } - /* make sure our super fits in the page */ - if (sizeof(*disk_super) > PAGE_SIZE) + if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev)) return ERR_PTR(-EINVAL); - /* make sure our super doesn't straddle pages on disk */ - index = bytenr >> PAGE_SHIFT; - if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index) - return ERR_PTR(-EINVAL); + if (drop_cache) { + /* This should only be called with the primary sb. */ + ASSERT(copy_num == 0); - /* pull in the page with our super */ - page = read_cache_page_gfp(bdev->bd_mapping, index, GFP_KERNEL); + /* + * Drop the page of the primary superblock, so later read will + * always read from the device. + */ + invalidate_inode_pages2_range(mapping, bytenr >> PAGE_SHIFT, + (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT); + } + page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); if (IS_ERR(page)) return ERR_CAST(page); - p = page_address(page); - - /* align our pointer to the offset of the super block */ - disk_super = p + offset_in_page(bytenr); - - if (btrfs_super_bytenr(disk_super) != bytenr_orig || - btrfs_super_magic(disk_super) != BTRFS_MAGIC) { - btrfs_release_disk_super(p); + super = page_address(page); + if (btrfs_super_magic(super) != BTRFS_MAGIC || + btrfs_super_bytenr(super) != bytenr_orig) { + btrfs_release_disk_super(super); return ERR_PTR(-EINVAL); } - if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1]) - disk_super->label[BTRFS_LABEL_SIZE - 1] = 0; + /* + * Make sure the last byte of label is properly NUL termiated. We use + * '%s' to print the label, if not properly NUL termiated we can access + * beyond the label. + */ + if (super->label[0] && super->label[BTRFS_LABEL_SIZE - 1]) + super->label[BTRFS_LABEL_SIZE - 1] = 0; - return disk_super; + return super; } int btrfs_forget_devices(dev_t devt) @@ -1458,7 +1422,7 @@ static bool btrfs_skip_registration(struct btrfs_super_block *disk_super, list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->bdev && (device->bdev->bd_dev == devt) && - strcmp(device->name->str, path) != 0) { + strcmp(rcu_dereference_raw(device->name), path) != 0) { mutex_unlock(&fs_devices->device_list_mutex); /* Do not skip registration. */ @@ -1484,30 +1448,17 @@ static bool btrfs_skip_registration(struct btrfs_super_block *disk_super, * the device or return an error. Multi-device and seeding devices are registered * in both cases. */ -struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, +struct btrfs_device *btrfs_scan_one_device(const char *path, bool mount_arg_dev) { struct btrfs_super_block *disk_super; bool new_device_added = false; struct btrfs_device *device = NULL; struct file *bdev_file; - char *canonical_path = NULL; - u64 bytenr; dev_t devt; - int ret; lockdep_assert_held(&uuid_mutex); - if (!is_good_dev_path(path)) { - canonical_path = kmalloc(PATH_MAX, GFP_KERNEL); - if (canonical_path) { - ret = get_canonical_dev_path(path, canonical_path); - if (ret < 0) { - kfree(canonical_path); - canonical_path = NULL; - } - } - } /* * Avoid an exclusive open here, as the systemd-udev may initiate the * device scan which may race with the user's mount or mkfs command, @@ -1518,24 +1469,11 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, * values temporarily, as the device paths of the fsid are the only * required information for assembling the volume. */ - bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL); + bdev_file = bdev_file_open_by_path(path, BLK_OPEN_READ, NULL, NULL); if (IS_ERR(bdev_file)) return ERR_CAST(bdev_file); - /* - * We would like to check all the super blocks, but doing so would - * allow a mount to succeed after a mkfs from a different filesystem. - * Currently, recovery from a bad primary btrfs superblock is done - * using the userspace command 'btrfs check --super'. - */ - ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr); - if (ret) { - device = ERR_PTR(ret); - goto error_bdev_put; - } - - disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr, - btrfs_sb_offset(0)); + disk_super = btrfs_read_disk_super(file_bdev(bdev_file), 0, false); if (IS_ERR(disk_super)) { device = ERR_CAST(disk_super); goto error_bdev_put; @@ -1543,7 +1481,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, devt = file_bdev(bdev_file)->bd_dev; if (btrfs_skip_registration(disk_super, path, devt, mount_arg_dev)) { - pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n", + btrfs_debug(NULL, "skip registering single non-seed device %s (%d:%d)", path, MAJOR(devt), MINOR(devt)); btrfs_free_stale_devices(devt, NULL); @@ -1552,8 +1490,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, goto free_disk_super; } - device = device_list_add(canonical_path ? : path, disk_super, - &new_device_added); + device = device_list_add(path, disk_super, &new_device_added); if (!IS_ERR(device) && new_device_added) btrfs_free_stale_devices(device->devt, device); @@ -1561,8 +1498,7 @@ free_disk_super: btrfs_release_disk_super(disk_super); error_bdev_put: - fput(bdev_file); - kfree(canonical_path); + bdev_fput(bdev_file); return device; } @@ -1578,9 +1514,9 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start, lockdep_assert_held(&device->fs_info->chunk_mutex); - if (find_first_extent_bit(&device->alloc_state, *start, - &physical_start, &physical_end, - CHUNK_ALLOCATED, NULL)) { + if (btrfs_find_first_extent_bit(&device->alloc_state, *start, + &physical_start, &physical_end, + CHUNK_ALLOCATED, NULL)) { if (in_range(physical_start, *start, len) || in_range(*start, physical_start, @@ -1595,6 +1531,9 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start, static u64 dev_extent_search_start(struct btrfs_device *device) { switch (device->fs_devices->chunk_alloc_policy) { + default: + btrfs_warn_unknown_chunk_allocation(device->fs_devices->chunk_alloc_policy); + fallthrough; case BTRFS_CHUNK_ALLOC_REGULAR: return BTRFS_DEVICE_RANGE_RESERVED; case BTRFS_CHUNK_ALLOC_ZONED: @@ -1604,8 +1543,6 @@ static u64 dev_extent_search_start(struct btrfs_device *device) * for superblock logging. */ return 0; - default: - BUG(); } } @@ -1618,7 +1555,8 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device, int ret; bool changed = false; - ASSERT(IS_ALIGNED(*hole_start, zone_size)); + ASSERT(IS_ALIGNED(*hole_start, zone_size), + "hole_start=%llu zone_size=%llu", *hole_start, zone_size); while (*hole_size > 0) { pos = btrfs_find_allocatable_zones(device, *hole_start, @@ -1684,6 +1622,9 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, } switch (device->fs_devices->chunk_alloc_policy) { + default: + btrfs_warn_unknown_chunk_allocation(device->fs_devices->chunk_alloc_policy); + fallthrough; case BTRFS_CHUNK_ALLOC_REGULAR: /* No extra check */ break; @@ -1698,8 +1639,6 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, continue; } break; - default: - BUG(); } break; @@ -1776,8 +1715,8 @@ again: path->skip_locking = 1; key.objectid = device->devid; - key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = search_start; ret = btrfs_search_backwards(root, &key, path); if (ret < 0) @@ -1869,7 +1808,9 @@ next: else ret = 0; - ASSERT(max_hole_start + max_hole_size <= search_end); + ASSERT(max_hole_start + max_hole_size <= search_end, + "max_hole_start=%llu max_hole_size=%llu search_end=%llu", + max_hole_start, max_hole_size, search_end); out: btrfs_free_path(path); *start = max_hole_start; @@ -1896,8 +1837,8 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, return -ENOMEM; key.objectid = device->devid; - key.offset = start; key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = start; again: ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { @@ -2045,7 +1986,6 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, ptr = btrfs_device_fsid(dev_item); write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, ptr, BTRFS_FSID_SIZE); - btrfs_mark_buffer_dirty(trans, leaf); ret = 0; out: @@ -2183,7 +2123,7 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) down_read(&fs_info->dev_replace.rwsem); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { - ASSERT(num_devices > 1); + ASSERT(num_devices > 1, "num_devices=%llu", num_devices); num_devices--; } up_read(&fs_info->dev_replace.rwsem); @@ -2199,7 +2139,7 @@ static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info, const u64 bytenr = btrfs_sb_offset(copy_num); int ret; - disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr); + disk_super = btrfs_read_disk_super(bdev, copy_num, false); if (IS_ERR(disk_super)) return; @@ -2232,7 +2172,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_devic btrfs_kobject_uevent(bdev, KOBJ_CHANGE); /* Update ctime/mtime for device path for libblkid */ - update_dev_time(device->name->str); + update_dev_time(rcu_dereference_raw(device->name)); } int btrfs_rm_device(struct btrfs_fs_info *fs_info, @@ -2272,7 +2212,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, } if (btrfs_pinned_by_swapfile(fs_info, device)) { - btrfs_warn_in_rcu(fs_info, + btrfs_warn(fs_info, "cannot remove device %s (devid %llu) due to active swapfile", btrfs_dev_name(device), device->devid); return -ETXTBSY; @@ -2362,7 +2302,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, * free the device. * * We cannot call btrfs_close_bdev() here because we're holding the sb - * write lock, and fput() on the block device will pull in the + * write lock, and bdev_fput() on the block device will pull in the * ->open_mutex on the block device and it's dependencies. Instead * just flush the device and let the caller do the final bdev_release. */ @@ -2387,7 +2327,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, */ if (cur_devices->num_devices == 0) { list_del_init(&cur_devices->seed_list); - ASSERT(cur_devices->opened == 1); + ASSERT(cur_devices->opened == 1, "opened=%d", cur_devices->opened); cur_devices->opened--; free_fs_devices(cur_devices); } @@ -2541,7 +2481,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, else memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); btrfs_release_disk_super(disk_super); - fput(bdev_file); + bdev_fput(bdev_file); return 0; } @@ -2700,8 +2640,8 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) return -ENOMEM; key.objectid = BTRFS_DEV_ITEMS_OBJECTID; - key.offset = 0; key.type = BTRFS_DEV_ITEM_KEY; + key.offset = 0; while (1) { btrfs_reserve_chunk_metadata(trans, false); @@ -2741,11 +2681,9 @@ next_slot: device = btrfs_find_device(fs_info->fs_devices, &args); BUG_ON(!device); /* Logic error */ - if (device->fs_devices->seeding) { + if (device->fs_devices->seeding) btrfs_set_device_generation(leaf, dev_item, device->generation); - btrfs_mark_buffer_dirty(trans, leaf); - } path->slots[0]++; goto next_slot; @@ -2775,7 +2713,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path return -EROFS; bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, - fs_info->bdev_holder, NULL); + fs_info->sb, &fs_holder_ops); if (IS_ERR(bdev_file)) return PTR_ERR(bdev_file); @@ -2991,7 +2929,7 @@ error_free_zone: error_free_device: btrfs_free_device(device); error: - fput(bdev_file); + bdev_fput(bdev_file); if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); @@ -3038,8 +2976,6 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, btrfs_device_get_disk_total_bytes(device)); btrfs_set_device_bytes_used(leaf, dev_item, btrfs_device_get_bytes_used(device)); - btrfs_mark_buffer_dirty(trans, leaf); - out: btrfs_free_path(path); return ret; @@ -3102,8 +3038,8 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) return -ENOMEM; key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - key.offset = chunk_offset; key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = chunk_offset; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) @@ -3321,7 +3257,8 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) * user having built with ASSERT enabled, so if ASSERT doesn't * do anything we still error out. */ - ASSERT(0); + DEBUG_WARN("errr %ld reading chunk map at offset %llu", + PTR_ERR(map), chunk_offset); return PTR_ERR(map); } @@ -3353,6 +3290,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) device->bytes_used - dev_extent_len); atomic64_add(dev_extent_len, &fs_info->free_chunk_space); btrfs_clear_space_info_full(fs_info); + + if (list_empty(&device->post_commit_list)) { + list_add_tail(&device->post_commit_list, + &trans->transaction->dev_update_list); + } + mutex_unlock(&fs_info->chunk_mutex); } } @@ -3402,8 +3345,16 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) if (ret == -ENOSPC) { const u64 sys_flags = btrfs_system_alloc_profile(fs_info); struct btrfs_block_group *sys_bg; + struct btrfs_space_info *space_info; + + space_info = btrfs_find_space_info(fs_info, sys_flags); + if (!space_info) { + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } - sys_bg = btrfs_create_chunk(trans, sys_flags); + sys_bg = btrfs_create_chunk(trans, space_info, sys_flags); if (IS_ERR(sys_bg)) { ret = PTR_ERR(sys_bg); btrfs_abort_transaction(trans, ret); @@ -3461,7 +3412,8 @@ out: return ret; } -int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) +int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset, + bool verbose) { struct btrfs_root *root = fs_info->chunk_root; struct btrfs_trans_handle *trans; @@ -3491,7 +3443,7 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) /* step one, relocate all the extents inside this chunk */ btrfs_scrub_pause(fs_info); - ret = btrfs_relocate_block_group(fs_info, chunk_offset); + ret = btrfs_relocate_block_group(fs_info, chunk_offset, true); btrfs_scrub_continue(fs_info); if (ret) { /* @@ -3560,8 +3512,8 @@ static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) again: key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = (u64)-1; while (1) { mutex_lock(&fs_info->reclaim_bgs_lock); @@ -3601,7 +3553,8 @@ again: btrfs_release_path(path); if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { - ret = btrfs_relocate_chunk(fs_info, found_key.offset); + ret = btrfs_relocate_chunk(fs_info, found_key.offset, + true); if (ret == -ENOSPC) failed++; else @@ -3748,10 +3701,7 @@ static int insert_balance_item(struct btrfs_fs_info *fs_info, btrfs_set_balance_meta(leaf, item, &disk_bargs); btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); btrfs_set_balance_sys(leaf, item, &disk_bargs); - btrfs_set_balance_flags(leaf, item, bctl->flags); - - btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); err = btrfs_commit_transaction(trans); @@ -3866,26 +3816,25 @@ static void reset_balance_state(struct btrfs_fs_info *fs_info) * Balance filters. Return 1 if chunk should be filtered out * (should not be balanced). */ -static int chunk_profiles_filter(u64 chunk_type, - struct btrfs_balance_args *bargs) +static bool chunk_profiles_filter(u64 chunk_type, struct btrfs_balance_args *bargs) { chunk_type = chunk_to_extended(chunk_type) & BTRFS_EXTENDED_PROFILE_MASK; if (bargs->profiles & chunk_type) - return 0; + return false; - return 1; + return true; } -static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, - struct btrfs_balance_args *bargs) +static bool chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, + struct btrfs_balance_args *bargs) { struct btrfs_block_group *cache; u64 chunk_used; u64 user_thresh_min; u64 user_thresh_max; - int ret = 1; + bool ret = true; cache = btrfs_lookup_block_group(fs_info, chunk_offset); chunk_used = cache->used; @@ -3903,18 +3852,18 @@ static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_off user_thresh_max = mult_perc(cache->length, bargs->usage_max); if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) - ret = 0; + ret = false; btrfs_put_block_group(cache); return ret; } -static int chunk_usage_filter(struct btrfs_fs_info *fs_info, - u64 chunk_offset, struct btrfs_balance_args *bargs) +static bool chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, + struct btrfs_balance_args *bargs) { struct btrfs_block_group *cache; u64 chunk_used, user_thresh; - int ret = 1; + bool ret = true; cache = btrfs_lookup_block_group(fs_info, chunk_offset); chunk_used = cache->used; @@ -3927,15 +3876,14 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, user_thresh = mult_perc(cache->length, bargs->usage); if (chunk_used < user_thresh) - ret = 0; + ret = false; btrfs_put_block_group(cache); return ret; } -static int chunk_devid_filter(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, - struct btrfs_balance_args *bargs) +static bool chunk_devid_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) { struct btrfs_stripe *stripe; int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); @@ -3944,10 +3892,10 @@ static int chunk_devid_filter(struct extent_buffer *leaf, for (i = 0; i < num_stripes; i++) { stripe = btrfs_stripe_nr(chunk, i); if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) - return 0; + return false; } - return 1; + return true; } static u64 calc_data_stripes(u64 type, int num_stripes) @@ -3960,9 +3908,8 @@ static u64 calc_data_stripes(u64 type, int num_stripes) } /* [pstart, pend) */ -static int chunk_drange_filter(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, - struct btrfs_balance_args *bargs) +static bool chunk_drange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) { struct btrfs_stripe *stripe; int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); @@ -3973,7 +3920,7 @@ static int chunk_drange_filter(struct extent_buffer *leaf, int i; if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) - return 0; + return false; type = btrfs_chunk_type(leaf, chunk); factor = calc_data_stripes(type, num_stripes); @@ -3989,56 +3936,53 @@ static int chunk_drange_filter(struct extent_buffer *leaf, if (stripe_offset < bargs->pend && stripe_offset + stripe_length > bargs->pstart) - return 0; + return false; } - return 1; + return true; } /* [vstart, vend) */ -static int chunk_vrange_filter(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, - u64 chunk_offset, - struct btrfs_balance_args *bargs) +static bool chunk_vrange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk, + u64 chunk_offset, struct btrfs_balance_args *bargs) { if (chunk_offset < bargs->vend && chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) /* at least part of the chunk is inside this vrange */ - return 0; + return false; - return 1; + return true; } -static int chunk_stripes_range_filter(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, - struct btrfs_balance_args *bargs) +static bool chunk_stripes_range_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) { int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); if (bargs->stripes_min <= num_stripes && num_stripes <= bargs->stripes_max) - return 0; + return false; - return 1; + return true; } -static int chunk_soft_convert_filter(u64 chunk_type, - struct btrfs_balance_args *bargs) +static bool chunk_soft_convert_filter(u64 chunk_type, struct btrfs_balance_args *bargs) { if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) - return 0; + return false; chunk_type = chunk_to_extended(chunk_type) & BTRFS_EXTENDED_PROFILE_MASK; if (bargs->target == chunk_type) - return 1; + return true; - return 0; + return false; } -static int should_balance_chunk(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, u64 chunk_offset) +static bool should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk *chunk, + u64 chunk_offset) { struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_balance_control *bctl = fs_info->balance_ctl; @@ -4048,7 +3992,7 @@ static int should_balance_chunk(struct extent_buffer *leaf, /* type filter */ if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { - return 0; + return false; } if (chunk_type & BTRFS_BLOCK_GROUP_DATA) @@ -4061,46 +4005,46 @@ static int should_balance_chunk(struct extent_buffer *leaf, /* profiles filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && chunk_profiles_filter(chunk_type, bargs)) { - return 0; + return false; } /* usage filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && chunk_usage_filter(fs_info, chunk_offset, bargs)) { - return 0; + return false; } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { - return 0; + return false; } /* devid filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && chunk_devid_filter(leaf, chunk, bargs)) { - return 0; + return false; } /* drange filter, makes sense only with devid filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && chunk_drange_filter(leaf, chunk, bargs)) { - return 0; + return false; } /* vrange filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { - return 0; + return false; } /* stripes filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && chunk_stripes_range_filter(leaf, chunk, bargs)) { - return 0; + return false; } /* soft profile changing mode */ if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && chunk_soft_convert_filter(chunk_type, bargs)) { - return 0; + return false; } /* @@ -4108,7 +4052,7 @@ static int should_balance_chunk(struct extent_buffer *leaf, */ if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { if (bargs->limit == 0) - return 0; + return false; else bargs->limit--; } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { @@ -4118,12 +4062,12 @@ static int should_balance_chunk(struct extent_buffer *leaf, * about the count of all chunks that satisfy the filters. */ if (bargs->limit_max == 0) - return 0; + return false; else bargs->limit_max--; } - return 1; + return true; } static int __btrfs_balance(struct btrfs_fs_info *fs_info) @@ -4170,8 +4114,8 @@ again: bctl->sys.limit = limit_sys; } key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = (u64)-1; while (1) { if ((!counting && atomic_read(&fs_info->balance_pause_req)) || @@ -4275,7 +4219,7 @@ again: } } - ret = btrfs_relocate_chunk(fs_info, found_key.offset); + ret = btrfs_relocate_chunk(fs_info, found_key.offset, true); mutex_unlock(&fs_info->reclaim_bgs_lock); if (ret == -ENOSPC) { enospc_errors++; @@ -4738,7 +4682,8 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) } spin_lock(&fs_info->super_lock); - ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED, + "exclusive_operation=%d", fs_info->exclusive_operation); fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; spin_unlock(&fs_info->super_lock); /* @@ -4987,8 +4932,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) again: key.objectid = device->devid; - key.offset = (u64)-1; key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = (u64)-1; do { mutex_lock(&fs_info->reclaim_bgs_lock); @@ -5042,7 +4987,7 @@ again: goto done; } - ret = btrfs_relocate_chunk(fs_info, chunk_offset); + ret = btrfs_relocate_chunk(fs_info, chunk_offset, true); mutex_unlock(&fs_info->reclaim_bgs_lock); if (ret == -ENOSPC) { failed++; @@ -5074,8 +5019,8 @@ again: mutex_lock(&fs_info->chunk_mutex); /* Clear all state bits beyond the shrunk device size */ - clear_extent_bits(&device->alloc_state, new_size, (u64)-1, - CHUNK_STATE_MASK); + btrfs_clear_extent_bit(&device->alloc_state, new_size, (u64)-1, + CHUNK_STATE_MASK, NULL); btrfs_device_set_disk_total_bytes(device, new_size); if (list_empty(&device->post_commit_list)) @@ -5202,6 +5147,8 @@ struct alloc_chunk_ctl { u64 stripe_size; u64 chunk_size; int ndevs; + /* Space_info the block group is going to belong. */ + struct btrfs_space_info *space_info; }; static void init_alloc_chunk_ctl_policy_regular( @@ -5275,14 +5222,15 @@ static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, ctl->ndevs = 0; switch (fs_devices->chunk_alloc_policy) { + default: + btrfs_warn_unknown_chunk_allocation(fs_devices->chunk_alloc_policy); + fallthrough; case BTRFS_CHUNK_ALLOC_REGULAR: init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); break; case BTRFS_CHUNK_ALLOC_ZONED: init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl); break; - default: - BUG(); } } @@ -5421,7 +5369,9 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, * It should hold because: * dev_extent_min == dev_extent_want == zone_size * dev_stripes */ - ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min); + ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min, + "ndevs=%d max_avail=%llu dev_extent_min=%llu", ctl->ndevs, + devices_info[ctl->ndevs - 1].max_avail, ctl->dev_extent_min); ctl->stripe_size = zone_size; ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; @@ -5434,7 +5384,9 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, ctl->dev_stripes); ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; - ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size); + ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size, + "stripe_size=%llu data_stripes=%d max_chunk_size=%llu", + ctl->stripe_size, data_stripes, ctl->max_chunk_size); } ctl->chunk_size = ctl->stripe_size * data_stripes; @@ -5467,12 +5419,13 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, ctl->ndevs = min(ctl->ndevs, ctl->devs_max); switch (fs_devices->chunk_alloc_policy) { + default: + btrfs_warn_unknown_chunk_allocation(fs_devices->chunk_alloc_policy); + fallthrough; case BTRFS_CHUNK_ALLOC_REGULAR: return decide_stripe_size_regular(ctl, devices_info); case BTRFS_CHUNK_ALLOC_ZONED: return decide_stripe_size_zoned(ctl, devices_info); - default: - BUG(); } } @@ -5482,9 +5435,9 @@ static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int struct btrfs_io_stripe *stripe = &map->stripes[i]; struct btrfs_device *device = stripe->dev; - set_extent_bit(&device->alloc_state, stripe->physical, - stripe->physical + map->stripe_size - 1, - bits | EXTENT_NOWAIT, NULL); + btrfs_set_extent_bit(&device->alloc_state, stripe->physical, + stripe->physical + map->stripe_size - 1, + bits | EXTENT_NOWAIT, NULL); } } @@ -5494,10 +5447,9 @@ static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned in struct btrfs_io_stripe *stripe = &map->stripes[i]; struct btrfs_device *device = stripe->dev; - __clear_extent_bit(&device->alloc_state, stripe->physical, - stripe->physical + map->stripe_size - 1, - bits | EXTENT_NOWAIT, - NULL, NULL); + btrfs_clear_extent_bit(&device->alloc_state, stripe->physical, + stripe->physical + map->stripe_size - 1, + bits | EXTENT_NOWAIT, NULL); } } @@ -5513,33 +5465,34 @@ void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_ma btrfs_free_chunk_map(map); } +static int btrfs_chunk_map_cmp(const struct rb_node *new, + const struct rb_node *exist) +{ + const struct btrfs_chunk_map *new_map = + rb_entry(new, struct btrfs_chunk_map, rb_node); + const struct btrfs_chunk_map *exist_map = + rb_entry(exist, struct btrfs_chunk_map, rb_node); + + if (new_map->start == exist_map->start) + return 0; + if (new_map->start < exist_map->start) + return -1; + return 1; +} + EXPORT_FOR_TESTS int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map) { - struct rb_node **p; - struct rb_node *parent = NULL; - bool leftmost = true; + struct rb_node *exist; write_lock(&fs_info->mapping_tree_lock); - p = &fs_info->mapping_tree.rb_root.rb_node; - while (*p) { - struct btrfs_chunk_map *entry; - - parent = *p; - entry = rb_entry(parent, struct btrfs_chunk_map, rb_node); - - if (map->start < entry->start) { - p = &(*p)->rb_left; - } else if (map->start > entry->start) { - p = &(*p)->rb_right; - leftmost = false; - } else { - write_unlock(&fs_info->mapping_tree_lock); - return -EEXIST; - } + exist = rb_find_add_cached(&map->rb_node, &fs_info->mapping_tree, + btrfs_chunk_map_cmp); + + if (exist) { + write_unlock(&fs_info->mapping_tree_lock); + return -EEXIST; } - rb_link_node(&map->rb_node, parent, p); - rb_insert_color_cached(&map->rb_node, &fs_info->mapping_tree, leftmost); chunk_map_device_set_bits(map, CHUNK_ALLOCATED); chunk_map_device_clear_bits(map, CHUNK_TRIMMED); write_unlock(&fs_info->mapping_tree_lock); @@ -5603,7 +5556,8 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, return ERR_PTR(ret); } - block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size); + block_group = btrfs_make_block_group(trans, ctl->space_info, type, start, + ctl->chunk_size); if (IS_ERR(block_group)) { btrfs_remove_chunk_map(info, map); return block_group; @@ -5629,7 +5583,8 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, } struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, - u64 type) + struct btrfs_space_info *space_info, + u64 type) { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_fs_devices *fs_devices = info->fs_devices; @@ -5641,7 +5596,7 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, lockdep_assert_held(&info->chunk_mutex); if (!alloc_profile_is_valid(type, 0)) { - ASSERT(0); + DEBUG_WARN("invalid alloc profile for type %llu", type); return ERR_PTR(-EINVAL); } @@ -5653,12 +5608,13 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { btrfs_err(info, "invalid chunk type 0x%llx requested", type); - ASSERT(0); + DEBUG_WARN(); return ERR_PTR(-EINVAL); } ctl.start = find_next_chunk(info); ctl.type = type; + ctl.space_info = space_info; init_alloc_chunk_ctl(fs_devices, &ctl); devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), @@ -5802,7 +5758,9 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; u64 alloc_profile; struct btrfs_block_group *meta_bg; + struct btrfs_space_info *meta_space_info; struct btrfs_block_group *sys_bg; + struct btrfs_space_info *sys_space_info; /* * When adding a new device for sprouting, the seed device is read-only @@ -5826,12 +5784,22 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) */ alloc_profile = btrfs_metadata_alloc_profile(fs_info); - meta_bg = btrfs_create_chunk(trans, alloc_profile); + meta_space_info = btrfs_find_space_info(fs_info, alloc_profile); + if (!meta_space_info) { + DEBUG_WARN(); + return -EINVAL; + } + meta_bg = btrfs_create_chunk(trans, meta_space_info, alloc_profile); if (IS_ERR(meta_bg)) return PTR_ERR(meta_bg); alloc_profile = btrfs_system_alloc_profile(fs_info); - sys_bg = btrfs_create_chunk(trans, alloc_profile); + sys_space_info = btrfs_find_space_info(fs_info, alloc_profile); + if (!sys_space_info) { + DEBUG_WARN(); + return -EINVAL; + } + sys_bg = btrfs_create_chunk(trans, sys_space_info, alloc_profile); if (IS_ERR(sys_bg)) return PTR_ERR(sys_bg); @@ -5959,9 +5927,79 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, return len; } +#ifdef CONFIG_BTRFS_EXPERIMENTAL +static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first, int num_stripes) +{ + for (int index = first; index < first + num_stripes; index++) { + const struct btrfs_device *device = map->stripes[index].dev; + + if (device->devid == READ_ONCE(device->fs_devices->read_devid)) + return index; + } + + /* If no read-preferred device is set use the first stripe. */ + return first; +} + +struct stripe_mirror { + u64 devid; + int num; +}; + +static int btrfs_cmp_devid(const void *a, const void *b) +{ + const struct stripe_mirror *s1 = (const struct stripe_mirror *)a; + const struct stripe_mirror *s2 = (const struct stripe_mirror *)b; + + if (s1->devid < s2->devid) + return -1; + if (s1->devid > s2->devid) + return 1; + return 0; +} + +/* + * Select a stripe for reading using the round-robin algorithm. + * + * 1. Compute the read cycle as the total sectors read divided by the minimum + * sectors per device. + * 2. Determine the stripe number for the current read by taking the modulus + * of the read cycle with the total number of stripes: + * + * stripe index = (total sectors / min sectors per dev) % num stripes + * + * The calculated stripe index is then used to select the corresponding device + * from the list of devices, which is ordered by devid. + */ +static int btrfs_read_rr(const struct btrfs_chunk_map *map, int first, int num_stripes) +{ + struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = { 0 }; + struct btrfs_device *device = map->stripes[first].dev; + struct btrfs_fs_info *fs_info = device->fs_devices->fs_info; + unsigned int read_cycle; + unsigned int total_reads; + unsigned int min_reads_per_dev; + + total_reads = percpu_counter_sum(&fs_info->stats_read_blocks); + min_reads_per_dev = READ_ONCE(fs_info->fs_devices->rr_min_contig_read) >> + fs_info->sectorsize_bits; + + for (int index = 0, i = first; i < first + num_stripes; i++) { + stripes[index].devid = map->stripes[i].dev->devid; + stripes[index].num = i; + index++; + } + sort(stripes, num_stripes, sizeof(struct stripe_mirror), + btrfs_cmp_devid, NULL); + + read_cycle = total_reads / min_reads_per_dev; + return stripes[read_cycle % num_stripes].num; +} +#endif + static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, int first, - int dev_replace_is_ongoing) + bool dev_replace_is_ongoing) { const enum btrfs_read_policy policy = READ_ONCE(fs_info->fs_devices->read_policy); int i; @@ -5970,8 +6008,8 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, int tolerance; struct btrfs_device *srcdev; - ASSERT((map->type & - (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); + ASSERT((map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)), + "type=%llu", map->type); if (map->type & BTRFS_BLOCK_GROUP_RAID10) num_stripes = map->sub_stripes; @@ -5988,6 +6026,14 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, case BTRFS_READ_POLICY_PID: preferred_mirror = first + (current->pid % num_stripes); break; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + case BTRFS_READ_POLICY_RR: + preferred_mirror = btrfs_read_rr(map, first, num_stripes); + break; + case BTRFS_READ_POLICY_DEVID: + preferred_mirror = btrfs_read_preferred(map, first, num_stripes); + break; +#endif } if (dev_replace_is_ongoing && @@ -6264,7 +6310,7 @@ static void handle_ops_on_dev_replace(struct btrfs_io_context *bioc, } /* We can only have at most 2 extra nr_stripes (for DUP). */ - ASSERT(nr_extra_stripes <= 2); + ASSERT(nr_extra_stripes <= 2, "nr_extra_stripes=%d", nr_extra_stripes); /* * For GET_READ_MIRRORS, we can only return at most 1 extra stripe for * replace. @@ -6275,7 +6321,8 @@ static void handle_ops_on_dev_replace(struct btrfs_io_context *bioc, struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1]; /* Only DUP can have two extra stripes. */ - ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP); + ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP, + "map_type=%llu", bioc->map_type); /* * Swap the last stripe stripes and reduce @nr_extra_stripes. @@ -6302,7 +6349,8 @@ static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset, */ io_geom->stripe_offset = offset & BTRFS_STRIPE_LEN_MASK; io_geom->stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; - ASSERT(io_geom->stripe_offset < U32_MAX); + ASSERT(io_geom->stripe_offset < U32_MAX, + "stripe_offset=%llu", io_geom->stripe_offset); if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { unsigned long full_stripe_len = @@ -6320,8 +6368,12 @@ static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset, io_geom->raid56_full_stripe_start = btrfs_stripe_nr_to_offset( rounddown(io_geom->stripe_nr, nr_data_stripes(map))); - ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset); - ASSERT(io_geom->raid56_full_stripe_start <= offset); + ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset, + "raid56_full_stripe_start=%llu full_stripe_len=%lu offset=%llu", + io_geom->raid56_full_stripe_start, full_stripe_len, offset); + ASSERT(io_geom->raid56_full_stripe_start <= offset, + "raid56_full_stripe_start=%llu offset=%llu", + io_geom->raid56_full_stripe_start, offset); /* * For writes to RAID56, allow to write a full stripe set, but * no straddling of stripe sets. @@ -6346,8 +6398,7 @@ static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical, { dst->dev = map->stripes[io_geom->stripe_index].dev; - if (io_geom->op == BTRFS_MAP_READ && - btrfs_need_stripe_tree_update(fs_info, map->type)) + if (io_geom->op == BTRFS_MAP_READ && io_geom->use_rst) return btrfs_get_raid_extent_offset(fs_info, logical, length, map->type, io_geom->stripe_index, dst); @@ -6362,7 +6413,7 @@ static bool is_single_device_io(struct btrfs_fs_info *fs_info, const struct btrfs_io_stripe *smap, const struct btrfs_chunk_map *map, int num_alloc_stripes, - enum btrfs_map_op op, int mirror_num) + struct btrfs_io_geometry *io_geom) { if (!smap) return false; @@ -6370,10 +6421,10 @@ static bool is_single_device_io(struct btrfs_fs_info *fs_info, if (num_alloc_stripes != 1) return false; - if (btrfs_need_stripe_tree_update(fs_info, map->type) && op != BTRFS_MAP_READ) + if (io_geom->use_rst && io_geom->op != BTRFS_MAP_READ) return false; - if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) + if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && io_geom->mirror_num > 1) return false; return true; @@ -6488,7 +6539,7 @@ static void map_blocks_raid56_read(struct btrfs_chunk_map *map, { int data_stripes = nr_data_stripes(map); - ASSERT(io_geom->mirror_num <= 1); + ASSERT(io_geom->mirror_num <= 1, "mirror_num=%d", io_geom->mirror_num); /* Just grab the data stripe directly. */ io_geom->stripe_index = io_geom->stripe_nr % data_stripes; io_geom->stripe_nr /= data_stripes; @@ -6556,7 +6607,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int num_copies; struct btrfs_io_context *bioc = NULL; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - int dev_replace_is_ongoing = 0; + bool dev_replace_is_ongoing = false; u16 num_alloc_stripes; u64 max_len; @@ -6579,6 +6630,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, io_geom.raid56_full_stripe_start = (u64)-1; max_len = btrfs_max_io_len(map, map_offset, &io_geom); *length = min_t(u64, map->chunk_len - map_offset, max_len); + io_geom.use_rst = btrfs_need_stripe_tree_update(fs_info, map->type); if (dev_replace->replace_task != current) down_read(&dev_replace->rwsem); @@ -6647,8 +6699,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * physical block information on the stack instead of allocating an * I/O context structure. */ - if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op, - io_geom.mirror_num)) { + if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, &io_geom)) { ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom); if (mirror_num_ret) *mirror_num_ret = io_geom.mirror_num; @@ -6662,6 +6713,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, goto out; } bioc->map_type = map->type; + bioc->use_rst = io_geom.use_rst; /* * For RAID56 full map, we need to make sure the stripes[] follows the @@ -6860,7 +6912,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, atomic_set(&dev->dev_stats_ccnt, 0); btrfs_device_data_ordered_init(dev); - extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE); + btrfs_extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE); if (devid) tmp = *devid; @@ -6881,9 +6933,9 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, generate_random_uuid(dev->uuid); if (path) { - struct rcu_string *name; + const char *name; - name = rcu_string_strdup(path, GFP_KERNEL); + name = kstrdup(path, GFP_KERNEL); if (!name) { btrfs_free_device(dev); return ERR_PTR(-ENOMEM); @@ -7002,16 +7054,6 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, warn_32bit_meta_chunk(fs_info, logical, length, type); #endif - /* - * Only need to verify chunk item if we're reading from sys chunk array, - * as chunk item in tree block is already verified by tree-checker. - */ - if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { - ret = btrfs_check_chunk_valid(leaf, chunk, logical); - if (ret) - return ret; - } - map = btrfs_find_chunk_map(fs_info, logical, 1); /* already mapped? */ @@ -7072,6 +7114,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, btrfs_err(fs_info, "failed to add chunk map, start=%llu len=%llu: %d", map->start, map->chunk_len, ret); + btrfs_free_chunk_map(map); } return ret; @@ -7117,8 +7160,12 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, fs_devices = find_fsid(fsid, NULL); if (!fs_devices) { - if (!btrfs_test_opt(fs_info, DEGRADED)) + if (!btrfs_test_opt(fs_info, DEGRADED)) { + btrfs_err(fs_info, + "failed to find fsid %pU when attempting to open seed devices", + fsid); return ERR_PTR(-ENOENT); + } fs_devices = alloc_fs_devices(fsid); if (IS_ERR(fs_devices)) @@ -7137,7 +7184,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, if (IS_ERR(fs_devices)) return fs_devices; - ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->bdev_holder); + ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->sb); if (ret) { free_fs_devices(fs_devices); return ERR_PTR(ret); @@ -7269,16 +7316,11 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) { struct btrfs_super_block *super_copy = fs_info->super_copy; struct extent_buffer *sb; - struct btrfs_disk_key *disk_key; - struct btrfs_chunk *chunk; u8 *array_ptr; unsigned long sb_array_offset; int ret = 0; - u32 num_stripes; u32 array_size; - u32 len = 0; u32 cur_offset; - u64 type; struct btrfs_key key; ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); @@ -7301,10 +7343,15 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) cur_offset = 0; while (cur_offset < array_size) { - disk_key = (struct btrfs_disk_key *)array_ptr; - len = sizeof(*disk_key); - if (cur_offset + len > array_size) - goto out_short_read; + struct btrfs_chunk *chunk; + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)array_ptr; + u32 len = sizeof(*disk_key); + + /* + * The sys_chunk_array has been already verified at super block + * read time. Only do ASSERT()s for basic checks. + */ + ASSERT(cur_offset + len <= array_size); btrfs_disk_key_to_cpu(&key, disk_key); @@ -7312,44 +7359,14 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) sb_array_offset += len; cur_offset += len; - if (key.type != BTRFS_CHUNK_ITEM_KEY) { - btrfs_err(fs_info, - "unexpected item type %u in sys_array at offset %u", - (u32)key.type, cur_offset); - ret = -EIO; - break; - } + ASSERT(key.type == BTRFS_CHUNK_ITEM_KEY); chunk = (struct btrfs_chunk *)sb_array_offset; - /* - * At least one btrfs_chunk with one stripe must be present, - * exact stripe count check comes afterwards - */ - len = btrfs_chunk_item_size(1); - if (cur_offset + len > array_size) - goto out_short_read; - - num_stripes = btrfs_chunk_num_stripes(sb, chunk); - if (!num_stripes) { - btrfs_err(fs_info, - "invalid number of stripes %u in sys_array at offset %u", - num_stripes, cur_offset); - ret = -EIO; - break; - } + ASSERT(btrfs_chunk_type(sb, chunk) & BTRFS_BLOCK_GROUP_SYSTEM); - type = btrfs_chunk_type(sb, chunk); - if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { - btrfs_err(fs_info, - "invalid chunk type %llu in sys_array at offset %u", - type, cur_offset); - ret = -EIO; - break; - } + len = btrfs_chunk_item_size(btrfs_chunk_num_stripes(sb, chunk)); - len = btrfs_chunk_item_size(num_stripes); - if (cur_offset + len > array_size) - goto out_short_read; + ASSERT(cur_offset + len <= array_size); ret = read_one_chunk(&key, sb, chunk); if (ret) @@ -7362,13 +7379,6 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) clear_extent_buffer_uptodate(sb); free_extent_buffer_stale(sb); return ret; - -out_short_read: - btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", - len, cur_offset); - clear_extent_buffer_uptodate(sb); - free_extent_buffer_stale(sb); - return -EIO; } /* @@ -7488,8 +7498,8 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). */ key.objectid = BTRFS_DEV_ITEMS_OBJECTID; - key.offset = 0; key.type = 0; + key.offset = 0; btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct extent_buffer *node = path->nodes[1]; @@ -7568,8 +7578,6 @@ int btrfs_init_devices_late(struct btrfs_fs_info *fs_info) struct btrfs_device *device; int ret = 0; - fs_devices->fs_info = fs_info; - mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) device->fs_info = fs_info; @@ -7708,7 +7716,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, return -ENOMEM; ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { - btrfs_warn_in_rcu(fs_info, + btrfs_warn(fs_info, "error %d while searching for dev_stats item for device %s", ret, btrfs_dev_name(device)); goto out; @@ -7719,7 +7727,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, /* need to delete old one and insert a new one */ ret = btrfs_del_item(trans, dev_root, path); if (ret != 0) { - btrfs_warn_in_rcu(fs_info, + btrfs_warn(fs_info, "delete too small dev_stats item for device %s failed %d", btrfs_dev_name(device), ret); goto out; @@ -7733,7 +7741,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, dev_root, path, &key, sizeof(*ptr)); if (ret < 0) { - btrfs_warn_in_rcu(fs_info, + btrfs_warn(fs_info, "insert dev_stats item for device %s failed %d", btrfs_dev_name(device), ret); goto out; @@ -7745,8 +7753,6 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) btrfs_set_dev_stats_value(eb, ptr, i, btrfs_dev_stat_read(device, i)); - btrfs_mark_buffer_dirty(trans, eb); - out: btrfs_free_path(path); return ret; @@ -7798,7 +7804,7 @@ void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) if (!dev->dev_stats_valid) return; - btrfs_err_rl_in_rcu(dev->fs_info, + btrfs_err_rl(dev->fs_info, "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", btrfs_dev_name(dev), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), @@ -7818,7 +7824,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) if (i == BTRFS_DEV_STAT_VALUES_MAX) return; /* all values == 0, suppress message */ - btrfs_info_in_rcu(dev->fs_info, + btrfs_info(dev->fs_info, "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", btrfs_dev_name(dev), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), @@ -7878,7 +7884,7 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans) { struct btrfs_device *curr, *next; - ASSERT(trans->state == TRANS_STATE_COMMIT_DOING); + ASSERT(trans->state == TRANS_STATE_COMMIT_DOING, "state=%d" , trans->state); if (list_empty(&trans->dev_update_list)) return; @@ -7942,7 +7948,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, } /* - * Very old mkfs.btrfs (before v4.1) will not respect the reserved + * Very old mkfs.btrfs (before v4.15) will not respect the reserved * space. Although kernel can handle it without problem, better to warn * the users. */ @@ -8194,7 +8200,7 @@ static int relocating_repair_kthread(void *data) btrfs_info(fs_info, "zoned: relocating block group %llu to repair IO failure", target); - ret = btrfs_relocate_chunk(fs_info, target); + ret = btrfs_relocate_chunk(fs_info, target, true); out: if (cache) @@ -8247,7 +8253,7 @@ static void map_raid56_repair_block(struct btrfs_io_context *bioc, logical < stripe_start + BTRFS_STRIPE_LEN) break; } - ASSERT(i < data_stripes); + ASSERT(i < data_stripes, "i=%d data_stripes=%d", i, data_stripes); smap->dev = bioc->stripes[i].dev; smap->physical = bioc->stripes[i].physical + ((logical - bioc->full_stripe_logical) & @@ -8276,7 +8282,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, int mirror_ret = mirror_num; int ret; - ASSERT(mirror_num > 0); + ASSERT(mirror_num > 0, "mirror_num=%d", mirror_num); ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, &bioc, smap, &mirror_ret); @@ -8284,7 +8290,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, return ret; /* The map range should not cross stripe boundary. */ - ASSERT(map_length >= length); + ASSERT(map_length >= length, "map_length=%llu length=%u", map_length, length); /* Already mapped to single stripe. */ if (!bioc) @@ -8296,7 +8302,8 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, goto out; } - ASSERT(mirror_num <= bioc->num_stripes); + ASSERT(mirror_num <= bioc->num_stripes, + "mirror_num=%d num_stripes=%d", mirror_num, bioc->num_stripes); smap->dev = bioc->stripes[mirror_num - 1].dev; smap->physical = bioc->stripes[mirror_num - 1].physical; out: diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 3a416b1bc24c..a56e873a3029 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -7,6 +7,7 @@ #define BTRFS_VOLUMES_H #include <linux/blk_types.h> +#include <linux/blkdev.h> #include <linux/sizes.h> #include <linux/atomic.h> #include <linux/sort.h> @@ -18,14 +19,16 @@ #include <linux/completion.h> #include <linux/rbtree.h> #include <uapi/linux/btrfs.h> +#include <uapi/linux/btrfs_tree.h> #include "messages.h" -#include "rcu-string.h" +#include "extent-io-tree.h" struct block_device; struct bdev_handle; struct btrfs_fs_info; struct btrfs_block_group; struct btrfs_trans_handle; +struct btrfs_transaction; struct btrfs_zoned_device_info; #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) @@ -110,7 +113,8 @@ struct btrfs_device { struct btrfs_fs_devices *fs_devices; struct btrfs_fs_info *fs_info; - struct rcu_string __rcu *name; + /* Device path or NULL if missing. */ + const char __rcu *name; u64 generation; @@ -296,6 +300,9 @@ enum btrfs_chunk_allocation_policy { BTRFS_CHUNK_ALLOC_ZONED, }; +#define BTRFS_DEFAULT_RR_MIN_CONTIG_READ (SZ_256K) +/* Keep in sync with raid_attr table, current maximum is RAID1C4. */ +#define BTRFS_RAID1_MAX_MIRRORS (4) /* * Read policies for mirrored block group profiles, read picks the stripe based * on these policies. @@ -303,6 +310,12 @@ enum btrfs_chunk_allocation_policy { enum btrfs_read_policy { /* Use process PID to choose the stripe */ BTRFS_READ_POLICY_PID, +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* Balancing RAID1 reads across all striped devices (round-robin). */ + BTRFS_READ_POLICY_RR, + /* Read from a specific device. */ + BTRFS_READ_POLICY_DEVID, +#endif BTRFS_NR_READ_POLICY, }; @@ -409,6 +422,16 @@ struct btrfs_fs_devices { /* Count fs-devices opened. */ int opened; + /* + * Counter of the processes that are holding this fs_devices but not + * yet opened. + * This is for mounting handling, as we can only open the fs_devices + * after a super block is created. But we cannot take uuid_mutex + * during sget_fc(), thus we have to hold the fs_devices (meaning it + * cannot be released) until a super block is returned. + */ + int holding; + /* Set when we find or add a device that doesn't have the nonrot flag set. */ bool rotating; /* Devices support TRIM/discard commands. */ @@ -417,6 +440,8 @@ struct btrfs_fs_devices { bool seeding; /* The mount needs to use a randomly generated fsid. */ bool temp_fsid; + /* Enable/disable the filesystem stats tracking. */ + bool collect_fs_stats; struct btrfs_fs_info *fs_info; /* sysfs kobjects */ @@ -431,6 +456,15 @@ struct btrfs_fs_devices { enum btrfs_read_policy read_policy; #ifdef CONFIG_BTRFS_EXPERIMENTAL + /* + * Minimum contiguous reads before switching to next device, the unit + * is one block/sectorsize. + */ + u32 rr_min_contig_read; + + /* Device to be used for reading in case of RAID1. */ + u64 read_devid; + /* Checksum mode - offload it or do it synchronously. */ enum btrfs_offload_csum_mode offload_csum_mode; #endif @@ -449,7 +483,6 @@ struct btrfs_io_stripe { struct btrfs_device *dev; /* Block mapping. */ u64 physical; - u64 length; bool rst_search_commit_root; /* For the endio handler. */ struct btrfs_io_context *bioc; @@ -485,6 +518,7 @@ struct btrfs_io_context { struct bio *orig_bio; atomic_t error; u16 max_errors; + bool use_rst; u64 logical; u64 size; @@ -643,7 +677,7 @@ enum btrfs_map_op { BTRFS_MAP_GET_READ_MIRRORS, }; -static inline enum btrfs_map_op btrfs_op(struct bio *bio) +static inline enum btrfs_map_op btrfs_op(const struct bio *bio) { switch (bio_op(bio)) { case REQ_OP_WRITE: @@ -690,12 +724,12 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, - u64 type); + struct btrfs_space_info *space_info, + u64 type); void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, blk_mode_t flags, void *holder); -struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, - bool mount_arg_dev); +struct btrfs_device *btrfs_scan_one_device(const char *path, bool mount_arg_dev); int btrfs_forget_devices(dev_t devt); void btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices); @@ -729,7 +763,8 @@ void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf); int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); int btrfs_recover_balance(struct btrfs_fs_info *fs_info); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); -int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset); +int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset, + bool verbose); int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); @@ -761,6 +796,8 @@ struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_inf struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, u64 logical, u64 length); void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map); +struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, + int copy_num, bool drop_cache); void btrfs_release_disk_super(struct btrfs_super_block *super); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, @@ -819,7 +856,26 @@ static inline const char *btrfs_dev_name(const struct btrfs_device *device) if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) return "<missing disk>"; else - return rcu_str_deref(device->name); + return rcu_dereference(device->name); +} + +static inline void btrfs_warn_unknown_chunk_allocation(enum btrfs_chunk_allocation_policy pol) +{ + WARN_ONCE(1, "unknown allocation policy %d, fallback to regular", pol); +} + +static inline void btrfs_fs_devices_inc_holding(struct btrfs_fs_devices *fs_devices) +{ + lockdep_assert_held(&uuid_mutex); + ASSERT(fs_devices->holding >= 0); + fs_devices->holding++; +} + +static inline void btrfs_fs_devices_dec_holding(struct btrfs_fs_devices *fs_devices) +{ + lockdep_assert_held(&uuid_mutex); + ASSERT(fs_devices->holding > 0); + fs_devices->holding--; } void btrfs_commit_device_sizes(struct btrfs_transaction *trans); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index bc18710d1dcf..79fb1614bd0c 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -204,7 +204,6 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, btrfs_set_dir_data_len(leaf, di, size); data_ptr = ((unsigned long)(di + 1)) + name_len; write_extent_buffer(leaf, value, data_ptr, size); - btrfs_mark_buffer_dirty(trans, leaf); } else { /* * Insert, and we had space for the xattr, so path->slots[0] is @@ -511,14 +510,15 @@ static int btrfs_initxattrs(struct inode *inode, */ nofs_flag = memalloc_nofs_save(); for (xattr = xattr_array; xattr->name != NULL; xattr++) { - name = kmalloc(XATTR_SECURITY_PREFIX_LEN + - strlen(xattr->name) + 1, GFP_KERNEL); + const size_t name_len = XATTR_SECURITY_PREFIX_LEN + + strlen(xattr->name) + 1; + + name = kmalloc(name_len, GFP_KERNEL); if (!name) { ret = -ENOMEM; break; } - strcpy(name, XATTR_SECURITY_PREFIX); - strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); + scnprintf(name, name_len, "%s%s", XATTR_SECURITY_PREFIX, xattr->name); if (strcmp(name, XATTR_NAME_CAPS) == 0) clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 8dc4cf49f6f0..0ce10e4ec836 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -6,6 +6,8 @@ #ifndef BTRFS_XATTR_H #define BTRFS_XATTR_H +#include <linux/types.h> + struct dentry; struct inode; struct qstr; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index ddf0d5a448a7..5292cd341f70 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -94,6 +94,45 @@ fail: return ERR_PTR(-ENOMEM); } +/* + * Helper for S390x with hardware zlib compression support. + * + * That hardware acceleration requires a buffer size larger than a single page + * to get ideal performance, thus we need to do the memory copy rather than + * use the page cache directly as input buffer. + */ +static int copy_data_into_buffer(struct address_space *mapping, + struct workspace *workspace, u64 filepos, + unsigned long length) +{ + u64 cur = filepos; + + /* It's only for hardware accelerated zlib code. */ + ASSERT(zlib_deflate_dfltcc_enabled()); + + while (cur < filepos + length) { + struct folio *folio; + void *data_in; + unsigned int offset; + unsigned long copy_length; + int ret; + + ret = btrfs_compress_filemap_get_folio(mapping, cur, &folio); + if (ret < 0) + return ret; + + offset = offset_in_folio(folio, cur); + copy_length = min(folio_size(folio) - offset, + filepos + length - cur); + + data_in = kmap_local_folio(folio, offset); + memcpy(workspace->buf + cur - filepos, data_in, copy_length); + kunmap_local(data_in); + cur += copy_length; + } + return 0; +} + int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) @@ -105,8 +144,6 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, int nr_folios = 0; struct folio *in_folio = NULL; struct folio *out_folio = NULL; - unsigned long bytes_left; - unsigned int in_buf_folios; unsigned long len = *total_out; unsigned long nr_dest_folios = *out_folios; const unsigned long max_out = nr_dest_folios * PAGE_SIZE; @@ -150,36 +187,22 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, * the workspace buffer if required. */ if (workspace->strm.avail_in == 0) { - bytes_left = len - workspace->strm.total_in; - in_buf_folios = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE), - workspace->buf_size / PAGE_SIZE); - if (in_buf_folios > 1) { - int i; - - /* S390 hardware acceleration path, not subpage. */ - ASSERT(!btrfs_is_subpage( - inode_to_fs_info(mapping->host), - mapping)); - for (i = 0; i < in_buf_folios; i++) { - if (data_in) { - kunmap_local(data_in); - folio_put(in_folio); - data_in = NULL; - } - ret = btrfs_compress_filemap_get_folio(mapping, - start, &in_folio); - if (ret < 0) - goto out; - data_in = kmap_local_folio(in_folio, 0); - copy_page(workspace->buf + i * PAGE_SIZE, - data_in); - start += PAGE_SIZE; - workspace->strm.avail_in = - (in_buf_folios << PAGE_SHIFT); - } + unsigned long bytes_left = len - workspace->strm.total_in; + unsigned int copy_length = min(bytes_left, workspace->buf_size); + + /* + * This can only happen when hardware zlib compression is + * enabled. + */ + if (copy_length > PAGE_SIZE) { + ret = copy_data_into_buffer(mapping, workspace, + start, copy_length); + if (ret < 0) + goto out; + start += copy_length; workspace->strm.next_in = workspace->buf; + workspace->strm.avail_in = copy_length; } else { - unsigned int pg_off; unsigned int cur_len; if (data_in) { @@ -191,9 +214,9 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, start, &in_folio); if (ret < 0) goto out; - pg_off = offset_in_page(start); - cur_len = btrfs_calc_input_length(orig_end, start); - data_in = kmap_local_folio(in_folio, pg_off); + cur_len = btrfs_calc_input_length(in_folio, orig_end, start); + data_in = kmap_local_folio(in_folio, + offset_in_folio(in_folio, start)); start += cur_len; workspace->strm.next_in = data_in; workspace->strm.avail_in = cur_len; @@ -463,6 +486,7 @@ out: const struct btrfs_compress_op btrfs_zlib_compress = { .workspace_manager = &wsm, + .min_level = 1, .max_level = 9, .default_level = BTRFS_ZLIB_DEFAULT_LEVEL, }; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 11ed523e528e..245e813ecd78 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -9,7 +9,6 @@ #include "ctree.h" #include "volumes.h" #include "zoned.h" -#include "rcu-string.h" #include "disk-io.h" #include "block-group.h" #include "dev-replace.h" @@ -17,6 +16,7 @@ #include "fs.h" #include "accessors.h" #include "bio.h" +#include "transaction.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 @@ -263,9 +263,9 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones, copy_zone_info_cb, zones); if (ret < 0) { - btrfs_err_in_rcu(device->fs_info, + btrfs_err(device->fs_info, "zoned: failed to read zone %llu on %s (devid %llu)", - pos, rcu_str_deref(device->name), + pos, rcu_dereference(device->name), device->devid); return ret; } @@ -395,16 +395,16 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) /* We reject devices with a zone size larger than 8GB */ if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) { - btrfs_err_in_rcu(fs_info, + btrfs_err(fs_info, "zoned: %s: zone size %llu larger than supported maximum %llu", - rcu_str_deref(device->name), + rcu_dereference(device->name), zone_info->zone_size, BTRFS_MAX_ZONE_SIZE); ret = -EINVAL; goto out; } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) { - btrfs_err_in_rcu(fs_info, + btrfs_err(fs_info, "zoned: %s: zone size %llu smaller than supported minimum %u", - rcu_str_deref(device->name), + rcu_dereference(device->name), zone_info->zone_size, BTRFS_MIN_ZONE_SIZE); ret = -EINVAL; goto out; @@ -418,9 +418,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) max_active_zones = bdev_max_active_zones(bdev); if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) { - btrfs_err_in_rcu(fs_info, + btrfs_err(fs_info, "zoned: %s: max active zones %u is too small, need at least %u active zones", - rcu_str_deref(device->name), max_active_zones, + rcu_dereference(device->name), max_active_zones, BTRFS_MIN_ACTIVE_ZONES); ret = -EINVAL; goto out; @@ -460,9 +460,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) zone_info->zone_cache = vcalloc(zone_info->nr_zones, sizeof(struct blk_zone)); if (!zone_info->zone_cache) { - btrfs_err_in_rcu(device->fs_info, + btrfs_err(device->fs_info, "zoned: failed to allocate zone cache for %s", - rcu_str_deref(device->name)); + rcu_dereference(device->name)); ret = -ENOMEM; goto out; } @@ -497,9 +497,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) } if (nreported != zone_info->nr_zones) { - btrfs_err_in_rcu(device->fs_info, + btrfs_err(device->fs_info, "inconsistent number of zones on %s (%u/%u)", - rcu_str_deref(device->name), nreported, + rcu_dereference(device->name), nreported, zone_info->nr_zones); ret = -EIO; goto out; @@ -507,9 +507,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) if (max_active_zones) { if (nactive > max_active_zones) { - btrfs_err_in_rcu(device->fs_info, + btrfs_err(device->fs_info, "zoned: %u active zones on %s exceeds max_active_zones %u", - nactive, rcu_str_deref(device->name), + nactive, rcu_dereference(device->name), max_active_zones); ret = -EIO; goto out; @@ -538,7 +538,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) goto out; if (nr_zones != BTRFS_NR_SB_LOG_ZONES) { - btrfs_err_in_rcu(device->fs_info, + btrfs_err(device->fs_info, "zoned: failed to read super block log zone info at devid %llu zone %u", device->devid, sb_zone); ret = -EUCLEAN; @@ -556,7 +556,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) ret = sb_write_pointer(device->bdev, &zone_info->sb_zones[sb_pos], &sb_wp); if (ret != -ENOENT && ret) { - btrfs_err_in_rcu(device->fs_info, + btrfs_err(device->fs_info, "zoned: super block log zone corrupted devid %llu zone %u", device->devid, sb_zone); ret = -EUCLEAN; @@ -575,9 +575,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) emulated = "emulated "; } - btrfs_info_in_rcu(fs_info, + btrfs_info(fs_info, "%s block device %s, %u %szones of %llu bytes", - model, rcu_str_deref(device->name), zone_info->nr_zones, + model, rcu_dereference(device->name), zone_info->nr_zones, emulated, zone_info->zone_size); return 0; @@ -748,8 +748,9 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) (u64)lim->max_segments << PAGE_SHIFT), fs_info->sectorsize); fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; - if (fs_info->max_zone_append_size < fs_info->max_extent_size) - fs_info->max_extent_size = fs_info->max_zone_append_size; + + fs_info->max_extent_size = min_not_zero(fs_info->max_extent_size, + fs_info->max_zone_append_size); /* * Check mount options here, because we might change fs_info->zoned @@ -988,7 +989,7 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) } /* All the zones are FULL. Should not reach here. */ - ASSERT(0); + DEBUG_WARN("unexpected state, all zones full"); return -EIO; } @@ -1181,10 +1182,10 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) continue; /* Free regions should be empty */ - btrfs_warn_in_rcu( + btrfs_warn( device->fs_info, "zoned: resetting device %s (devid %llu) zone %llu for allocation", - rcu_str_deref(device->name), device->devid, pos >> shift); + rcu_dereference(device->name), device->devid, pos >> shift); WARN_ON_ONCE(1); ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size, @@ -1276,7 +1277,7 @@ struct zone_info { static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, struct zone_info *info, unsigned long *active, - struct btrfs_chunk_map *map) + struct btrfs_chunk_map *map, bool new) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; struct btrfs_device *device; @@ -1306,6 +1307,8 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, return 0; } + ASSERT(!new || btrfs_dev_is_empty_zone(device, info->physical)); + /* This zone will be used for allocation, so mark this zone non-empty. */ btrfs_dev_clear_zone_empty(device, info->physical); @@ -1318,6 +1321,18 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, * to determine the allocation offset within the zone. */ WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size)); + + if (new) { + sector_t capacity; + + capacity = bdev_zone_capacity(device->bdev, info->physical >> SECTOR_SHIFT); + up_read(&dev_replace->rwsem); + info->alloc_offset = 0; + info->capacity = capacity << SECTOR_SHIFT; + + return 0; + } + nofs_flag = memalloc_nofs_save(); ret = btrfs_get_dev_zone(device, info->physical, &zone); memalloc_nofs_restore(nofs_flag); @@ -1330,9 +1345,9 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, } if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { - btrfs_err_in_rcu(fs_info, + btrfs_err(fs_info, "zoned: unexpected conventional zone %llu on device %s (devid %llu)", - zone.start << SECTOR_SHIFT, rcu_str_deref(device->name), + zone.start << SECTOR_SHIFT, rcu_dereference(device->name), device->devid); up_read(&dev_replace->rwsem); return -EIO; @@ -1343,10 +1358,10 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, switch (zone.cond) { case BLK_ZONE_COND_OFFLINE: case BLK_ZONE_COND_READONLY: - btrfs_err_in_rcu(fs_info, + btrfs_err(fs_info, "zoned: offline/readonly zone %llu on device %s (devid %llu)", (info->physical >> device->zone_info->zone_size_shift), - rcu_str_deref(device->name), device->devid); + rcu_dereference(device->name), device->devid); info->alloc_offset = WP_MISSING_DEV; break; case BLK_ZONE_COND_EMPTY: @@ -1388,7 +1403,8 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg, static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, - unsigned long *active) + unsigned long *active, + u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; @@ -1411,6 +1427,13 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, zone_info[1].physical); return -EIO; } + + if (zone_info[0].alloc_offset == WP_CONVENTIONAL) + zone_info[0].alloc_offset = last_alloc; + + if (zone_info[1].alloc_offset == WP_CONVENTIONAL) + zone_info[1].alloc_offset = last_alloc; + if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) { btrfs_err(bg->fs_info, "zoned: write pointer offset mismatch of zones in DUP profile"); @@ -1431,7 +1454,8 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, - unsigned long *active) + unsigned long *active, + u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; int i; @@ -1446,10 +1470,12 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); for (i = 0; i < map->num_stripes; i++) { - if (zone_info[i].alloc_offset == WP_MISSING_DEV || - zone_info[i].alloc_offset == WP_CONVENTIONAL) + if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; + if (zone_info[i].alloc_offset == WP_CONVENTIONAL) + zone_info[i].alloc_offset = last_alloc; + if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) && !btrfs_test_opt(fs_info, DEGRADED)) { btrfs_err(fs_info, @@ -1479,7 +1505,8 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, - unsigned long *active) + unsigned long *active, + u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; @@ -1490,10 +1517,29 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, } for (int i = 0; i < map->num_stripes; i++) { - if (zone_info[i].alloc_offset == WP_MISSING_DEV || - zone_info[i].alloc_offset == WP_CONVENTIONAL) + if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; + if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { + u64 stripe_nr, full_stripe_nr; + u64 stripe_offset; + int stripe_index; + + stripe_nr = div64_u64(last_alloc, map->stripe_size); + stripe_offset = stripe_nr * map->stripe_size; + full_stripe_nr = div_u64(stripe_nr, map->num_stripes); + div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); + + zone_info[i].alloc_offset = + full_stripe_nr * map->stripe_size; + + if (stripe_index > i) + zone_info[i].alloc_offset += map->stripe_size; + else if (stripe_index == i) + zone_info[i].alloc_offset += + (last_alloc - stripe_offset); + } + if (test_bit(0, active) != test_bit(i, active)) { if (!btrfs_zone_activate(bg)) return -EIO; @@ -1511,7 +1557,8 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, - unsigned long *active) + unsigned long *active, + u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; @@ -1522,8 +1569,7 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, } for (int i = 0; i < map->num_stripes; i++) { - if (zone_info[i].alloc_offset == WP_MISSING_DEV || - zone_info[i].alloc_offset == WP_CONVENTIONAL) + if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; if (test_bit(0, active) != test_bit(i, active)) { @@ -1534,6 +1580,29 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); } + if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { + u64 stripe_nr, full_stripe_nr; + u64 stripe_offset; + int stripe_index; + + stripe_nr = div64_u64(last_alloc, map->stripe_size); + stripe_offset = stripe_nr * map->stripe_size; + full_stripe_nr = div_u64(stripe_nr, + map->num_stripes / map->sub_stripes); + div_u64_rem(stripe_nr, + (map->num_stripes / map->sub_stripes), + &stripe_index); + + zone_info[i].alloc_offset = + full_stripe_nr * map->stripe_size; + + if (stripe_index > (i / map->sub_stripes)) + zone_info[i].alloc_offset += map->stripe_size; + else if (stripe_index == (i / map->sub_stripes)) + zone_info[i].alloc_offset += + (last_alloc - stripe_offset); + } + if ((i % map->sub_stripes) == 0) { bg->zone_capacity += zone_info[i].capacity; bg->alloc_offset += zone_info[i].alloc_offset; @@ -1587,7 +1656,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } for (i = 0; i < map->num_stripes; i++) { - ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map); + ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map, new); if (ret) goto out; @@ -1622,18 +1691,22 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) ret = btrfs_load_block_group_single(cache, &zone_info[0], active); break; case BTRFS_BLOCK_GROUP_DUP: - ret = btrfs_load_block_group_dup(cache, map, zone_info, active); + ret = btrfs_load_block_group_dup(cache, map, zone_info, active, + last_alloc); break; case BTRFS_BLOCK_GROUP_RAID1: case BTRFS_BLOCK_GROUP_RAID1C3: case BTRFS_BLOCK_GROUP_RAID1C4: - ret = btrfs_load_block_group_raid1(cache, map, zone_info, active); + ret = btrfs_load_block_group_raid1(cache, map, zone_info, + active, last_alloc); break; case BTRFS_BLOCK_GROUP_RAID0: - ret = btrfs_load_block_group_raid0(cache, map, zone_info, active); + ret = btrfs_load_block_group_raid0(cache, map, zone_info, + active, last_alloc); break; case BTRFS_BLOCK_GROUP_RAID10: - ret = btrfs_load_block_group_raid10(cache, map, zone_info, active); + ret = btrfs_load_block_group_raid10(cache, map, zone_info, + active, last_alloc); break; case BTRFS_BLOCK_GROUP_RAID5: case BTRFS_BLOCK_GROUP_RAID6: @@ -1658,7 +1731,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) * stripe. */ cache->alloc_offset = cache->zone_capacity; - ret = 0; } out: @@ -1783,12 +1855,12 @@ static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered, ordered->disk_bytenr = logical; write_lock(&em_tree->lock); - em = search_extent_mapping(em_tree, ordered->file_offset, - ordered->num_bytes); + em = btrfs_search_extent_mapping(em_tree, ordered->file_offset, + ordered->num_bytes); /* The em should be a new COW extent, thus it should not have an offset. */ ASSERT(em->offset == 0); em->disk_bytenr = logical; - free_extent_map(em); + btrfs_free_extent_map(em); write_unlock(&em_tree->lock); } @@ -1798,8 +1870,8 @@ static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered, struct btrfs_ordered_extent *new; if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && - split_extent_map(ordered->inode, ordered->file_offset, - ordered->num_bytes, len, logical)) + btrfs_split_extent_map(ordered->inode, ordered->file_offset, + ordered->num_bytes, len, logical)) return false; new = btrfs_split_ordered_extent(ordered, len); @@ -2110,6 +2182,9 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) physical = map->stripes[i].physical; zinfo = device->zone_info; + if (!device->bdev) + continue; + if (zinfo->max_active_zones == 0) continue; @@ -2154,27 +2229,15 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; const u64 end = block_group->start + block_group->length; - struct radix_tree_iter iter; struct extent_buffer *eb; - void __rcu **slot; + unsigned long index, start = (block_group->start >> fs_info->sectorsize_bits); rcu_read_lock(); - radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, - block_group->start >> fs_info->sectorsize_bits) { - eb = radix_tree_deref_slot(slot); - if (!eb) - continue; - if (radix_tree_deref_retry(eb)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - + xa_for_each_start(&fs_info->buffer_tree, index, eb, start) { if (eb->start < block_group->start) continue; if (eb->start >= end) break; - - slot = radix_tree_iter_resume(slot, &iter); rcu_read_unlock(); wait_on_extent_buffer_writeback(eb); rcu_read_lock(); @@ -2271,6 +2334,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ struct btrfs_zoned_device_info *zinfo = device->zone_info; unsigned int nofs_flags; + if (!device->bdev) + continue; + if (zinfo->max_active_zones == 0) continue; @@ -2324,6 +2390,9 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) if (!btrfs_is_zoned(fs_info)) return true; + if (test_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags)) + return false; + /* Check if there is a device with active zones left */ mutex_lock(&fs_info->chunk_mutex); spin_lock(&fs_info->zone_active_bgs_lock); @@ -2416,7 +2485,7 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, /* For the work */ btrfs_get_block_group(bg); - atomic_inc(&eb->refs); + refcount_inc(&eb->refs); bg->last_eb = eb; INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn); queue_work(system_unbound_wq, &bg->zone_finish_work); @@ -2432,6 +2501,66 @@ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) spin_unlock(&fs_info->relocation_bg_lock); } +void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; + struct btrfs_space_info *space_info = data_sinfo->sub_group[0]; + struct btrfs_trans_handle *trans; + struct btrfs_block_group *bg; + struct list_head *bg_list; + u64 alloc_flags; + bool initial = false; + bool did_chunk_alloc = false; + int index; + int ret; + + if (!btrfs_is_zoned(fs_info)) + return; + + if (fs_info->data_reloc_bg) + return; + + if (sb_rdonly(fs_info->sb)) + return; + + ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC); + alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags); + index = btrfs_bg_flags_to_raid_index(alloc_flags); + + bg_list = &data_sinfo->block_groups[index]; +again: + list_for_each_entry(bg, bg_list, list) { + if (bg->used > 0) + continue; + + if (!initial) { + initial = true; + continue; + } + + fs_info->data_reloc_bg = bg->start; + set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &bg->runtime_flags); + btrfs_zone_activate(bg); + + return; + } + + if (did_chunk_alloc) + return; + + trans = btrfs_join_transaction(fs_info->tree_root); + if (IS_ERR(trans)) + return; + + ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE); + btrfs_end_transaction(trans); + if (ret == 1) { + did_chunk_alloc = true; + bg_list = &space_info->block_groups[index]; + goto again; + } +} + void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; @@ -2454,8 +2583,8 @@ bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; + u64 total = btrfs_super_total_bytes(fs_info->super_copy); u64 used = 0; - u64 total = 0; u64 factor; ASSERT(btrfs_is_zoned(fs_info)); @@ -2468,7 +2597,6 @@ bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info) if (!device->bdev) continue; - total += device->disk_total_bytes; used += device->bytes_used; } mutex_unlock(&fs_devices->device_list_mutex); @@ -2651,3 +2779,127 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->zone_active_bgs_lock); } + +/* + * Reset the zones of unused block groups from @space_info->bytes_zone_unusable. + * + * @space_info: the space to work on + * @num_bytes: targeting reclaim bytes + * + * This one resets the zones of a block group, so we can reuse the region + * without removing the block group. On the other hand, btrfs_delete_unused_bgs() + * just removes a block group and frees up the underlying zones. So, we still + * need to allocate a new block group to reuse the zones. + * + * Resetting is faster than deleting/recreating a block group. It is similar + * to freeing the logical space on the regular mode. However, we cannot change + * the block group's profile with this operation. + */ +int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; + const sector_t zone_size_sectors = fs_info->zone_size >> SECTOR_SHIFT; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + while (num_bytes > 0) { + struct btrfs_chunk_map *map; + struct btrfs_block_group *bg = NULL; + bool found = false; + u64 reclaimed = 0; + + /* + * Here, we choose a fully zone_unusable block group. It's + * technically possible to reset a partly zone_unusable block + * group, which still has some free space left. However, + * handling that needs to cope with the allocation side, which + * makes the logic more complex. So, let's handle the easy case + * for now. + */ + spin_lock(&fs_info->unused_bgs_lock); + list_for_each_entry(bg, &fs_info->unused_bgs, bg_list) { + if ((bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != space_info->flags) + continue; + + /* + * Use trylock to avoid locking order violation. In + * btrfs_reclaim_bgs_work(), the lock order is + * &bg->lock -> &fs_info->unused_bgs_lock. We skip a + * block group if we cannot take its lock. + */ + if (!spin_trylock(&bg->lock)) + continue; + if (btrfs_is_block_group_used(bg) || bg->zone_unusable < bg->length) { + spin_unlock(&bg->lock); + continue; + } + spin_unlock(&bg->lock); + found = true; + break; + } + if (!found) { + spin_unlock(&fs_info->unused_bgs_lock); + return 0; + } + + list_del_init(&bg->bg_list); + btrfs_put_block_group(bg); + spin_unlock(&fs_info->unused_bgs_lock); + + /* + * Since the block group is fully zone_unusable and we cannot + * allocate from this block group anymore, we don't need to set + * this block group read-only. + */ + + down_read(&fs_info->dev_replace.rwsem); + map = bg->physical_map; + for (int i = 0; i < map->num_stripes; i++) { + struct btrfs_io_stripe *stripe = &map->stripes[i]; + unsigned int nofs_flags; + int ret; + + nofs_flags = memalloc_nofs_save(); + ret = blkdev_zone_mgmt(stripe->dev->bdev, REQ_OP_ZONE_RESET, + stripe->physical >> SECTOR_SHIFT, + zone_size_sectors); + memalloc_nofs_restore(nofs_flags); + + if (ret) { + up_read(&fs_info->dev_replace.rwsem); + return ret; + } + } + up_read(&fs_info->dev_replace.rwsem); + + spin_lock(&space_info->lock); + spin_lock(&bg->lock); + ASSERT(!btrfs_is_block_group_used(bg)); + if (bg->ro) { + spin_unlock(&bg->lock); + spin_unlock(&space_info->lock); + continue; + } + + reclaimed = bg->alloc_offset; + bg->zone_unusable = bg->length - bg->zone_capacity; + bg->alloc_offset = 0; + /* + * This holds because we currently reset fully used then freed + * block group. + */ + ASSERT(reclaimed == bg->zone_capacity); + bg->free_space_ctl->free_space += reclaimed; + space_info->bytes_zone_unusable -= reclaimed; + spin_unlock(&bg->lock); + btrfs_return_free_space(space_info, reclaimed); + spin_unlock(&space_info->lock); + + if (num_bytes <= reclaimed) + break; + num_bytes -= reclaimed; + } + + return 0; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 7612e6572605..6e11533b8e14 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -88,6 +88,7 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, struct extent_buffer *eb); void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); +void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info); void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info); void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, @@ -96,6 +97,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, bool do_finish); void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info); +int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) @@ -240,6 +242,8 @@ static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { } +static inline void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info) { } + static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { } static inline bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info) @@ -265,6 +269,12 @@ static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, static inline void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) { } +static inline int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, + u64 num_bytes) +{ + return 0; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 5232b56d5892..ff0292615e1f 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -24,13 +24,14 @@ #include "super.h" #define ZSTD_BTRFS_MAX_WINDOWLOG 17 -#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) +#define ZSTD_BTRFS_MAX_INPUT (1U << ZSTD_BTRFS_MAX_WINDOWLOG) #define ZSTD_BTRFS_DEFAULT_LEVEL 3 +#define ZSTD_BTRFS_MIN_LEVEL -15 #define ZSTD_BTRFS_MAX_LEVEL 15 /* 307s to avoid pathologically clashing with transaction commit */ #define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ) -static zstd_parameters zstd_get_btrfs_parameters(unsigned int level, +static zstd_parameters zstd_get_btrfs_parameters(int level, size_t src_len) { zstd_parameters params = zstd_get_params(level, src_len); @@ -45,13 +46,14 @@ struct workspace { void *mem; size_t size; char *buf; - unsigned int level; - unsigned int req_level; + int level; + int req_level; unsigned long last_used; /* jiffies */ struct list_head list; struct list_head lru_list; zstd_in_buffer in_buf; zstd_out_buffer out_buf; + zstd_parameters params; }; /* @@ -93,8 +95,10 @@ static inline struct workspace *list_to_workspace(struct list_head *list) return container_of(list, struct workspace, list); } -void zstd_free_workspace(struct list_head *ws); -struct list_head *zstd_alloc_workspace(unsigned int level); +static inline int clip_level(int level) +{ + return max(0, level - 1); +} /* * Timer callback to free unused workspaces. @@ -123,7 +127,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) list_for_each_prev_safe(pos, next, &wsm.lru_list) { struct workspace *victim = container_of(pos, struct workspace, lru_list); - unsigned int level; + int level; if (time_after(victim->last_used, reclaim_threshold)) break; @@ -137,8 +141,8 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) list_del(&victim->list); zstd_free_workspace(&victim->list); - if (list_empty(&wsm.idle_ws[level - 1])) - clear_bit(level - 1, &wsm.active_map); + if (list_empty(&wsm.idle_ws[level])) + clear_bit(level, &wsm.active_map); } @@ -160,9 +164,11 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) static void zstd_calc_ws_mem_sizes(void) { size_t max_size = 0; - unsigned int level; + int level; - for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) { + for (level = ZSTD_BTRFS_MIN_LEVEL; level <= ZSTD_BTRFS_MAX_LEVEL; level++) { + if (level == 0) + continue; zstd_parameters params = zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT); size_t level_size = @@ -171,7 +177,8 @@ static void zstd_calc_ws_mem_sizes(void) zstd_dstream_workspace_bound(ZSTD_BTRFS_MAX_INPUT)); max_size = max_t(size_t, max_size, level_size); - zstd_ws_mem_sizes[level - 1] = max_size; + /* Use level 1 workspace size for all the fast mode negative levels. */ + zstd_ws_mem_sizes[clip_level(level)] = max_size; } } @@ -193,8 +200,7 @@ void zstd_init_workspace_manager(void) ws = zstd_alloc_workspace(ZSTD_BTRFS_MAX_LEVEL); if (IS_ERR(ws)) { - pr_warn( - "BTRFS: cannot preallocate zstd compression workspace\n"); + btrfs_warn(NULL, "cannot preallocate zstd compression workspace"); } else { set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map); list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]); @@ -218,7 +224,7 @@ void zstd_cleanup_workspace_manager(void) } spin_unlock_bh(&wsm.lock); - del_timer_sync(&wsm.timer); + timer_delete_sync(&wsm.timer); } /* @@ -233,11 +239,11 @@ void zstd_cleanup_workspace_manager(void) * offer the opportunity to reclaim the workspace in favor of allocating an * appropriately sized one in the future. */ -static struct list_head *zstd_find_workspace(unsigned int level) +static struct list_head *zstd_find_workspace(int level) { struct list_head *ws; struct workspace *workspace; - int i = level - 1; + int i = clip_level(level); spin_lock_bh(&wsm.lock); for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) { @@ -247,7 +253,7 @@ static struct list_head *zstd_find_workspace(unsigned int level) list_del_init(ws); /* keep its place if it's a lower level using this */ workspace->req_level = level; - if (level == workspace->level) + if (clip_level(level) == workspace->level) list_del(&workspace->lru_list); if (list_empty(&wsm.idle_ws[i])) clear_bit(i, &wsm.active_map); @@ -270,7 +276,7 @@ static struct list_head *zstd_find_workspace(unsigned int level) * attempt to allocate a new workspace. If we fail to allocate one due to * memory pressure, go to sleep waiting for the max level workspace to free up. */ -struct list_head *zstd_get_workspace(unsigned int level) +struct list_head *zstd_get_workspace(int level) { struct list_head *ws; unsigned int nofs_flag; @@ -319,7 +325,7 @@ void zstd_put_workspace(struct list_head *ws) spin_lock_bh(&wsm.lock); /* A node is only taken off the lru if we are the corresponding level */ - if (workspace->req_level == workspace->level) { + if (clip_level(workspace->req_level) == workspace->level) { /* Hide a max level workspace from reclaim */ if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) { INIT_LIST_HEAD(&workspace->lru_list); @@ -332,13 +338,13 @@ void zstd_put_workspace(struct list_head *ws) } } - set_bit(workspace->level - 1, &wsm.active_map); - list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]); + set_bit(workspace->level, &wsm.active_map); + list_add(&workspace->list, &wsm.idle_ws[workspace->level]); workspace->req_level = 0; spin_unlock_bh(&wsm.lock); - if (workspace->level == ZSTD_BTRFS_MAX_LEVEL) + if (workspace->level == clip_level(ZSTD_BTRFS_MAX_LEVEL)) cond_wake_up(&wsm.wait); } @@ -351,7 +357,7 @@ void zstd_free_workspace(struct list_head *ws) kfree(workspace); } -struct list_head *zstd_alloc_workspace(unsigned int level) +struct list_head *zstd_alloc_workspace(int level) { struct workspace *workspace; @@ -359,8 +365,9 @@ struct list_head *zstd_alloc_workspace(unsigned int level) if (!workspace) return ERR_PTR(-ENOMEM); - workspace->size = zstd_ws_mem_sizes[level - 1]; - workspace->level = level; + /* Use level 1 workspace size for all the fast mode negative levels. */ + workspace->size = zstd_ws_mem_sizes[clip_level(level)]; + workspace->level = clip_level(level); workspace->req_level = level; workspace->last_used = jiffies; workspace->mem = kvmalloc(workspace->size, GFP_KERNEL | __GFP_NOWARN); @@ -393,17 +400,15 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, const unsigned long nr_dest_folios = *out_folios; const u64 orig_end = start + len; unsigned long max_out = nr_dest_folios * PAGE_SIZE; - unsigned int pg_off; unsigned int cur_len; - zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level, - len); + workspace->params = zstd_get_btrfs_parameters(workspace->req_level, len); *out_folios = 0; *total_out = 0; *total_in = 0; /* Initialize the stream */ - stream = zstd_init_cstream(¶ms, len, workspace->mem, + stream = zstd_init_cstream(&workspace->params, len, workspace->mem, workspace->size); if (unlikely(!stream)) { struct btrfs_inode *inode = BTRFS_I(mapping->host); @@ -420,9 +425,8 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); if (ret < 0) goto out; - pg_off = offset_in_page(start); - cur_len = btrfs_calc_input_length(orig_end, start); - workspace->in_buf.src = kmap_local_folio(in_folio, pg_off); + cur_len = btrfs_calc_input_length(in_folio, orig_end, start); + workspace->in_buf.src = kmap_local_folio(in_folio, offset_in_folio(in_folio, start)); workspace->in_buf.pos = 0; workspace->in_buf.size = cur_len; @@ -506,9 +510,9 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); if (ret < 0) goto out; - pg_off = offset_in_page(start); - cur_len = btrfs_calc_input_length(orig_end, start); - workspace->in_buf.src = kmap_local_folio(in_folio, pg_off); + cur_len = btrfs_calc_input_length(in_folio, orig_end, start); + workspace->in_buf.src = kmap_local_folio(in_folio, + offset_in_folio(in_folio, start)); workspace->in_buf.pos = 0; workspace->in_buf.size = cur_len; } @@ -717,6 +721,7 @@ finish: const struct btrfs_compress_op btrfs_zstd_compress = { /* ZSTD uses own workspace manager */ .workspace_manager = NULL, + .min_level = ZSTD_BTRFS_MIN_LEVEL, .max_level = ZSTD_BTRFS_MAX_LEVEL, .default_level = ZSTD_BTRFS_DEFAULT_LEVEL, }; |