diff options
Diffstat (limited to 'fs/btrfs')
63 files changed, 2409 insertions, 1402 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 4fb925e8c981..fa8515598341 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -78,6 +78,32 @@ config BTRFS_ASSERT If unsure, say N. +config BTRFS_EXPERIMENTAL + bool "Btrfs experimental features" + depends on BTRFS_FS + default n + help + Enable experimental features. These features may not be stable enough + for end users. This is meant for btrfs developers or users who wish + to test the functionality and report problems. + + Current list: + + - extent map shrinker - performance problems with too frequent shrinks + + - send stream protocol v3 - fs-verity support + + - checksum offload mode - sysfs knob to affect when checksums are + calculated (at IO time, or in a thread) + + - raid-stripe-tree - additional mapping of extents to devices to + support RAID1* profiles on zoned devices, + RAID56 not yet supported + + - extent tree v2 - complex rework of extent tracking + + If unsure, say N. + config BTRFS_FS_REF_VERIFY bool "Btrfs with the ref verify tool compiled in" depends on BTRFS_FS diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 87617f2968bc..3cfc440c636c 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -43,4 +43,5 @@ btrfs-$(CONFIG_FS_VERITY) += verity.o btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \ - tests/free-space-tree-tests.o tests/extent-map-tests.o + tests/free-space-tree-tests.o tests/extent-map-tests.o \ + tests/raid-stripe-tree-tests.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f8e1d5b2c512..04f53ca548e1 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1442,7 +1442,8 @@ again: */ delayed_refs = &ctx->trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, ctx->bytenr); + head = btrfs_find_delayed_ref_head(ctx->fs_info, delayed_refs, + ctx->bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { refcount_inc(&head->refs); diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 7e0f9600b80c..1f216d07eff6 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -587,7 +587,7 @@ static bool should_async_write(struct btrfs_bio *bbio) { bool auto_csum_mode = true; -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices; enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 4423d8b716a5..4427c1b835e8 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2797,7 +2797,7 @@ next: * uncompressed data size, because the compression is only done * when writeback triggered and we don't know how much space we * are actually going to need, so we reserve the uncompressed - * size because the data may be uncompressible in the worst case. + * size because the data may be incompressible in the worst case. */ if (ret == 0) { bool used; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index e152fde888fc..aa1f55cd81b7 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -577,7 +577,6 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state struct extent_state *other); void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct extent_state *orig, u64 split); -void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); void btrfs_evict_inode(struct inode *inode); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); @@ -613,11 +612,17 @@ int btrfs_writepage_cow_fixup(struct folio *folio); int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type); int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, u64 disk_bytenr, - u64 disk_io_size, - struct page **pages); + u64 disk_bytenr, u64 disk_io_size, + struct page **pages, void *uring_ctx); ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, - struct btrfs_ioctl_encoded_io_args *encoded); + struct btrfs_ioctl_encoded_io_args *encoded, + struct extent_state **cached_state, + u64 *disk_bytenr, u64 *disk_io_size); +ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, + u64 start, u64 lockend, + struct extent_state **cached_state, + u64 disk_bytenr, u64 disk_io_size, + size_t count, bool compressed, bool *unlocked); ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 90aef2627ca2..0c4d486c3048 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -453,7 +453,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (pg_index > end_index) break; - folio = __filemap_get_folio(mapping, pg_index, 0, 0); + folio = filemap_get_folio(mapping, pg_index); if (!IS_ERR(folio)) { u64 folio_sz = folio_size(folio); u64 offset = offset_in_folio(folio, cur); @@ -545,8 +545,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, * subpage::readers and to unlock the page. */ if (fs_info->sectorsize < PAGE_SIZE) - btrfs_subpage_start_reader(fs_info, folio, cur, - add_size); + btrfs_folio_set_lock(fs_info, folio, cur, add_size); folio_put(folio); cur += add_size; } @@ -702,7 +701,7 @@ static void free_heuristic_ws(struct list_head *ws) kfree(workspace); } -static struct list_head *alloc_heuristic_ws(unsigned int level) +static struct list_head *alloc_heuristic_ws(void) { struct heuristic_ws *ws; @@ -744,9 +743,9 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = { static struct list_head *alloc_workspace(int type, unsigned int level) { switch (type) { - case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level); + case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(); case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level); - case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(level); + case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(); case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level); default: /* @@ -1030,6 +1029,7 @@ int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping { int type = btrfs_compress_type(type_level); int level = btrfs_compress_level(type_level); + const unsigned long orig_len = *total_out; struct list_head *workspace; int ret; @@ -1037,6 +1037,8 @@ int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping workspace = get_workspace(type, level); ret = compression_compress_pages(type, workspace, mapping, start, folios, out_folios, total_in, total_out); + /* The total read-in bytes should be no larger than the input. */ + ASSERT(*total_in <= orig_len); put_workspace(type, workspace); return ret; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index b6563b6a333e..954034086d0d 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -175,7 +175,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); -struct list_head *lzo_alloc_workspace(unsigned int level); +struct list_head *lzo_alloc_workspace(void); void lzo_free_workspace(struct list_head *ws); int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0cc919d15b14..148648ea1c8b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1508,26 +1508,26 @@ static noinline void unlock_up(struct btrfs_path *path, int level, */ static int read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, - struct extent_buffer **eb_ret, int level, int slot, + struct extent_buffer **eb_ret, int slot, const struct btrfs_key *key) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_tree_parent_check check = { 0 }; u64 blocknr; - u64 gen; - struct extent_buffer *tmp; - int ret; + struct extent_buffer *tmp = NULL; + int ret = 0; int parent_level; - bool unlock_up; + int err; + bool read_tmp = false; + bool tmp_locked = false; + bool path_released = false; - unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]); blocknr = btrfs_node_blockptr(*eb_ret, slot); - gen = btrfs_node_ptr_generation(*eb_ret, slot); parent_level = btrfs_header_level(*eb_ret); btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot); check.has_first_key = true; check.level = parent_level - 1; - check.transid = gen; + check.transid = btrfs_node_ptr_generation(*eb_ret, slot); check.owner_root = btrfs_root_id(root); /* @@ -1540,79 +1540,115 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, tmp = find_extent_buffer(fs_info, blocknr); if (tmp) { if (p->reada == READA_FORWARD_ALWAYS) - reada_for_search(fs_info, p, level, slot, key->objectid); + reada_for_search(fs_info, p, parent_level, slot, key->objectid); /* first we do an atomic uptodate check */ - if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { + if (btrfs_buffer_uptodate(tmp, check.transid, 1) > 0) { /* * Do extra check for first_key, eb can be stale due to * being cached, read from scrub, or have multiple * parents (shared tree blocks). */ - if (btrfs_verify_level_key(tmp, - parent_level - 1, &check.first_key, gen)) { - free_extent_buffer(tmp); - return -EUCLEAN; + if (btrfs_verify_level_key(tmp, &check)) { + ret = -EUCLEAN; + goto out; } *eb_ret = tmp; - return 0; + tmp = NULL; + ret = 0; + goto out; } if (p->nowait) { - free_extent_buffer(tmp); - return -EAGAIN; + ret = -EAGAIN; + goto out; } - if (unlock_up) - btrfs_unlock_up_safe(p, level + 1); - - /* now we're allowed to do a blocking uptodate check */ - ret = btrfs_read_extent_buffer(tmp, &check); - if (ret) { - free_extent_buffer(tmp); + if (!p->skip_locking) { + btrfs_unlock_up_safe(p, parent_level + 1); + tmp_locked = true; + btrfs_tree_read_lock(tmp); btrfs_release_path(p); - return ret; + ret = -EAGAIN; + path_released = true; } - if (unlock_up) - ret = -EAGAIN; + /* Now we're allowed to do a blocking uptodate check. */ + err = btrfs_read_extent_buffer(tmp, &check); + if (err) { + ret = err; + goto out; + } + if (ret == 0) { + ASSERT(!tmp_locked); + *eb_ret = tmp; + tmp = NULL; + } goto out; } else if (p->nowait) { - return -EAGAIN; + ret = -EAGAIN; + goto out; } - if (unlock_up) { - btrfs_unlock_up_safe(p, level + 1); + if (!p->skip_locking) { + btrfs_unlock_up_safe(p, parent_level + 1); ret = -EAGAIN; - } else { - ret = 0; } if (p->reada != READA_NONE) - reada_for_search(fs_info, p, level, slot, key->objectid); + reada_for_search(fs_info, p, parent_level, slot, key->objectid); - tmp = read_tree_block(fs_info, blocknr, &check); + tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level); if (IS_ERR(tmp)) { + ret = PTR_ERR(tmp); + tmp = NULL; + goto out; + } + read_tmp = true; + + if (!p->skip_locking) { + ASSERT(ret == -EAGAIN); + tmp_locked = true; + btrfs_tree_read_lock(tmp); btrfs_release_path(p); - return PTR_ERR(tmp); + path_released = true; + } + + /* Now we're allowed to do a blocking uptodate check. */ + err = btrfs_read_extent_buffer(tmp, &check); + if (err) { + ret = err; + goto out; } + /* * If the read above didn't mark this buffer up to date, * it will never end up being up to date. Set ret to EIO now * and give up so that our caller doesn't loop forever * on our EAGAINs. */ - if (!extent_buffer_uptodate(tmp)) + if (!extent_buffer_uptodate(tmp)) { ret = -EIO; + goto out; + } -out: if (ret == 0) { + ASSERT(!tmp_locked); *eb_ret = tmp; - } else { - free_extent_buffer(tmp); - btrfs_release_path(p); + tmp = NULL; + } +out: + if (tmp) { + if (tmp_locked) + btrfs_tree_read_unlock(tmp); + if (read_tmp && ret && ret != -EAGAIN) + free_extent_buffer_stale(tmp); + else + free_extent_buffer(tmp); } + if (ret && !path_released) + btrfs_release_path(p); return ret; } @@ -2197,8 +2233,8 @@ cow_done: goto done; } - err = read_block_for_search(root, p, &b, level, slot, key); - if (err == -EAGAIN) + err = read_block_for_search(root, p, &b, slot, key); + if (err == -EAGAIN && !p->nowait) goto again; if (err) { ret = err; @@ -2324,8 +2360,8 @@ again: goto done; } - err = read_block_for_search(root, p, &b, level, slot, key); - if (err == -EAGAIN) + err = read_block_for_search(root, p, &b, slot, key); + if (err == -EAGAIN && !p->nowait) goto again; if (err) { ret = err; @@ -2334,7 +2370,7 @@ again: level = btrfs_header_level(b); btrfs_tree_read_lock(b); - b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq); + b = btrfs_tree_mod_log_rewind(fs_info, b, time_seq); if (!b) { ret = -ENOMEM; goto done; @@ -4930,8 +4966,7 @@ again: } next = c; - ret = read_block_for_search(root, path, &next, level, - slot, &key); + ret = read_block_for_search(root, path, &next, slot, &key); if (ret == -EAGAIN && !path->nowait) goto again; @@ -4974,8 +5009,7 @@ again: if (!level) break; - ret = read_block_for_search(root, path, &next, level, - 0, &key); + ret = read_block_for_search(root, path, &next, 0, &key); if (ret == -EAGAIN && !path->nowait) goto again; diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 7cfefdfe54ea..f4d9feac0d0e 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -64,9 +64,9 @@ struct btrfs_delayed_node { struct mutex mutex; struct btrfs_inode_item inode_item; refcount_t refs; + int count; u64 index_cnt; unsigned long flags; - int count; /* * The size of the next batch of dir index items to insert (if this * node is from a directory inode). Protected by @mutex. diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index cab94d141f66..0d878dbbabba 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -9,6 +9,7 @@ #include "messages.h" #include "ctree.h" #include "delayed-ref.h" +#include "extent-tree.h" #include "transaction.h" #include "qgroup.h" #include "space-info.h" @@ -313,39 +314,6 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1, return 0; } -/* insert a new ref to head ref rbtree */ -static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root, - struct rb_node *node) -{ - struct rb_node **p = &root->rb_root.rb_node; - struct rb_node *parent_node = NULL; - struct btrfs_delayed_ref_head *entry; - struct btrfs_delayed_ref_head *ins; - u64 bytenr; - bool leftmost = true; - - ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node); - bytenr = ins->bytenr; - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct btrfs_delayed_ref_head, - href_node); - - if (bytenr < entry->bytenr) { - p = &(*p)->rb_left; - } else if (bytenr > entry->bytenr) { - p = &(*p)->rb_right; - leftmost = false; - } else { - return entry; - } - } - - rb_link_node(node, parent_node, p); - rb_insert_color_cached(node, root, leftmost); - return NULL; -} - static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, struct btrfs_delayed_ref_node *ins) { @@ -380,75 +348,32 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, static struct btrfs_delayed_ref_head *find_first_ref_head( struct btrfs_delayed_ref_root *dr) { - struct rb_node *n; - struct btrfs_delayed_ref_head *entry; + unsigned long from = 0; - n = rb_first_cached(&dr->href_root); - if (!n) - return NULL; + lockdep_assert_held(&dr->lock); - entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); - - return entry; + return xa_find(&dr->head_refs, &from, ULONG_MAX, XA_PRESENT); } -/* - * Find a head entry based on bytenr. This returns the delayed ref head if it - * was able to find one, or NULL if nothing was in that spot. If return_bigger - * is given, the next bigger entry is returned if no exact match is found. - */ -static struct btrfs_delayed_ref_head *find_ref_head( - struct btrfs_delayed_ref_root *dr, u64 bytenr, - bool return_bigger) -{ - struct rb_root *root = &dr->href_root.rb_root; - struct rb_node *n; - struct btrfs_delayed_ref_head *entry; - - n = root->rb_node; - entry = NULL; - while (n) { - entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); - - if (bytenr < entry->bytenr) - n = n->rb_left; - else if (bytenr > entry->bytenr) - n = n->rb_right; - else - return entry; - } - if (entry && return_bigger) { - if (bytenr > entry->bytenr) { - n = rb_next(&entry->href_node); - if (!n) - return NULL; - entry = rb_entry(n, struct btrfs_delayed_ref_head, - href_node); - } - return entry; - } - return NULL; -} - -int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head) +static bool btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) { lockdep_assert_held(&delayed_refs->lock); if (mutex_trylock(&head->mutex)) - return 0; + return true; refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); spin_lock(&delayed_refs->lock); - if (RB_EMPTY_NODE(&head->href_node)) { + if (!head->tracked) { mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); - return -EAGAIN; + return false; } btrfs_put_delayed_ref_head(head); - return 0; + return true; } static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info, @@ -462,7 +387,6 @@ static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info, if (!list_empty(&ref->add_list)) list_del(&ref->add_list); btrfs_put_delayed_ref(ref); - atomic_dec(&delayed_refs->num_entries); btrfs_delayed_refs_rsv_release(fs_info, 1, 0); } @@ -558,33 +482,31 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) } struct btrfs_delayed_ref_head *btrfs_select_ref_head( + const struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs) { struct btrfs_delayed_ref_head *head; + unsigned long start_index; + unsigned long found_index; + bool found_head = false; + bool locked; - lockdep_assert_held(&delayed_refs->lock); + spin_lock(&delayed_refs->lock); again: - head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, - true); - if (!head && delayed_refs->run_delayed_start != 0) { - delayed_refs->run_delayed_start = 0; - head = find_first_ref_head(delayed_refs); + start_index = (delayed_refs->run_delayed_start >> fs_info->sectorsize_bits); + xa_for_each_start(&delayed_refs->head_refs, found_index, head, start_index) { + if (!head->processing) { + found_head = true; + break; + } } - if (!head) - return NULL; - - while (head->processing) { - struct rb_node *node; - - node = rb_next(&head->href_node); - if (!node) { - if (delayed_refs->run_delayed_start == 0) - return NULL; - delayed_refs->run_delayed_start = 0; - goto again; + if (!found_head) { + if (delayed_refs->run_delayed_start == 0) { + spin_unlock(&delayed_refs->lock); + return NULL; } - head = rb_entry(node, struct btrfs_delayed_ref_head, - href_node); + delayed_refs->run_delayed_start = 0; + goto again; } head->processing = true; @@ -592,18 +514,42 @@ again: delayed_refs->num_heads_ready--; delayed_refs->run_delayed_start = head->bytenr + head->num_bytes; + + locked = btrfs_delayed_ref_lock(delayed_refs, head); + spin_unlock(&delayed_refs->lock); + + /* + * We may have dropped the spin lock to get the head mutex lock, and + * that might have given someone else time to free the head. If that's + * true, it has been removed from our list and we can move on. + */ + if (!locked) + return ERR_PTR(-EAGAIN); + return head; } -void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, +void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + spin_lock(&delayed_refs->lock); + head->processing = false; + delayed_refs->num_heads_ready++; + spin_unlock(&delayed_refs->lock); + btrfs_delayed_ref_unlock(head); +} + +void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { + const unsigned long index = (head->bytenr >> fs_info->sectorsize_bits); + lockdep_assert_held(&delayed_refs->lock); lockdep_assert_held(&head->lock); - rb_erase_cached(&head->href_node, &delayed_refs->href_root); - RB_CLEAR_NODE(&head->href_node); - atomic_dec(&delayed_refs->num_entries); + xa_erase(&delayed_refs->head_refs, index); + head->tracked = false; delayed_refs->num_heads--; if (!head->processing) delayed_refs->num_heads_ready--; @@ -629,7 +575,6 @@ static bool insert_delayed_ref(struct btrfs_trans_handle *trans, if (!exist) { if (ref->action == BTRFS_ADD_DELAYED_REF) list_add_tail(&ref->add_list, &href->ref_add_list); - atomic_inc(&root->num_entries); spin_unlock(&href->lock); trans->delayed_ref_updates++; return false; @@ -813,7 +758,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, head_ref->is_system = (generic_ref->ref_root == BTRFS_CHUNK_TREE_OBJECTID); head_ref->ref_tree = RB_ROOT_CACHED; INIT_LIST_HEAD(&head_ref->ref_add_list); - RB_CLEAR_NODE(&head_ref->href_node); + head_ref->tracked = false; head_ref->processing = false; head_ref->total_ref_mod = count_mod; spin_lock_init(&head_ref->lock); @@ -830,7 +775,6 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, qrecord->data_rsv = reserved; qrecord->data_rsv_refroot = generic_ref->ref_root; } - qrecord->bytenr = generic_ref->bytenr; qrecord->num_bytes = generic_ref->num_bytes; qrecord->old_roots = NULL; } @@ -852,19 +796,33 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_root *delayed_refs; + const unsigned long index = (head_ref->bytenr >> fs_info->sectorsize_bits); bool qrecord_inserted = false; delayed_refs = &trans->transaction->delayed_refs; + lockdep_assert_held(&delayed_refs->lock); + +#if BITS_PER_LONG == 32 + if (head_ref->bytenr >= MAX_LFS_FILESIZE) { + if (qrecord) + xa_release(&delayed_refs->dirty_extents, index); + btrfs_err_rl(fs_info, +"delayed ref head %llu is beyond 32bit page cache and xarray index limit", + head_ref->bytenr); + btrfs_err_32bit_limit(fs_info); + return ERR_PTR(-EOVERFLOW); + } +#endif /* Record qgroup extent info if provided */ if (qrecord) { int ret; - ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord); + ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord, + head_ref->bytenr); if (ret) { /* Clean up if insertion fails or item exists. */ - xa_release(&delayed_refs->dirty_extents, - qrecord->bytenr >> fs_info->sectorsize_bits); + xa_release(&delayed_refs->dirty_extents, index); /* Caller responsible for freeing qrecord on error. */ if (ret < 0) return ERR_PTR(ret); @@ -876,8 +834,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, trace_add_delayed_ref_head(fs_info, head_ref, action); - existing = htree_insert(&delayed_refs->href_root, - &head_ref->href_node); + existing = xa_load(&delayed_refs->head_refs, index); if (existing) { update_existing_head_ref(trans, existing, head_ref); /* @@ -887,6 +844,19 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { + existing = xa_store(&delayed_refs->head_refs, index, head_ref, GFP_ATOMIC); + if (xa_is_err(existing)) { + /* Memory was preallocated by the caller. */ + ASSERT(xa_err(existing) != -ENOMEM); + return ERR_PTR(xa_err(existing)); + } else if (WARN_ON(existing)) { + /* + * Shouldn't happen we just did a lookup before under + * delayed_refs->lock. + */ + return ERR_PTR(-EEXIST); + } + head_ref->tracked = true; /* * We reserve the amount of bytes needed to delete csums when * adding the ref head and not when adding individual drop refs @@ -900,7 +870,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, } delayed_refs->num_heads++; delayed_refs->num_heads_ready++; - atomic_inc(&delayed_refs->num_entries); } if (qrecord_inserted_ret) *qrecord_inserted_ret = qrecord_inserted; @@ -1008,6 +977,8 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *new_head_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; + const unsigned long index = (generic_ref->bytenr >> fs_info->sectorsize_bits); + bool qrecord_reserved = false; bool qrecord_inserted; int action = generic_ref->action; bool merged; @@ -1023,25 +994,32 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, goto free_node; } + delayed_refs = &trans->transaction->delayed_refs; + if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { ret = -ENOMEM; goto free_head_ref; } - if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, - generic_ref->bytenr >> fs_info->sectorsize_bits, - GFP_NOFS)) { + if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) { ret = -ENOMEM; goto free_record; } + qrecord_reserved = true; + } + + ret = xa_reserve(&delayed_refs->head_refs, index, GFP_NOFS); + if (ret) { + if (qrecord_reserved) + xa_release(&delayed_refs->dirty_extents, index); + goto free_record; } init_delayed_ref_common(fs_info, node, generic_ref); init_delayed_ref_head(head_ref, generic_ref, record, reserved); head_ref->extent_op = extent_op; - delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); /* @@ -1051,6 +1029,7 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, new_head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); if (IS_ERR(new_head_ref)) { + xa_release(&delayed_refs->head_refs, index); spin_unlock(&delayed_refs->lock); ret = PTR_ERR(new_head_ref); goto free_record; @@ -1074,7 +1053,7 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_delayed_ref_node_cachep, node); if (qrecord_inserted) - return btrfs_qgroup_trace_extent_post(trans, record); + return btrfs_qgroup_trace_extent_post(trans, record, generic_ref->bytenr); return 0; free_record: @@ -1113,6 +1092,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u8 level, struct btrfs_delayed_extent_op *extent_op) { + const unsigned long index = (bytenr >> trans->fs_info->sectorsize_bits); struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_head *head_ref_ret; struct btrfs_delayed_ref_root *delayed_refs; @@ -1123,6 +1103,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, .num_bytes = num_bytes, .tree_ref.level = level, }; + int ret; head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) @@ -1132,16 +1113,23 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); + ret = xa_reserve(&delayed_refs->head_refs, index, GFP_NOFS); + if (ret) { + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); + return ret; + } + + spin_lock(&delayed_refs->lock); head_ref_ret = add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD, NULL); - spin_unlock(&delayed_refs->lock); - if (IS_ERR(head_ref_ret)) { + xa_release(&delayed_refs->head_refs, index); + spin_unlock(&delayed_refs->lock); kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); return PTR_ERR(head_ref_ret); } + spin_unlock(&delayed_refs->lock); /* * Need to update the delayed_refs_rsv with any changes we may have @@ -1164,11 +1152,15 @@ void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) * head node if found, or NULL if not. */ struct btrfs_delayed_ref_head * -btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr) +btrfs_find_delayed_ref_head(const struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + u64 bytenr) { + const unsigned long index = (bytenr >> fs_info->sectorsize_bits); + lockdep_assert_held(&delayed_refs->lock); - return find_ref_head(delayed_refs, bytenr, false); + return xa_load(&delayed_refs->head_refs, index); } static int find_comp(struct btrfs_delayed_ref_node *entry, u64 root, u64 parent) @@ -1238,6 +1230,81 @@ bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, return found; } +void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) +{ + struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; + struct btrfs_fs_info *fs_info = trans->fs_info; + + spin_lock(&delayed_refs->lock); + while (true) { + struct btrfs_delayed_ref_head *head; + struct rb_node *n; + bool pin_bytes = false; + + head = find_first_ref_head(delayed_refs); + if (!head) + break; + + if (!btrfs_delayed_ref_lock(delayed_refs, head)) + continue; + + spin_lock(&head->lock); + while ((n = rb_first_cached(&head->ref_tree)) != NULL) { + struct btrfs_delayed_ref_node *ref; + + ref = rb_entry(n, struct btrfs_delayed_ref_node, ref_node); + drop_delayed_ref(fs_info, delayed_refs, head, ref); + } + if (head->must_insert_reserved) + pin_bytes = true; + btrfs_free_delayed_extent_op(head->extent_op); + btrfs_delete_ref_head(fs_info, delayed_refs, head); + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + mutex_unlock(&head->mutex); + + if (pin_bytes) { + struct btrfs_block_group *bg; + + bg = btrfs_lookup_block_group(fs_info, head->bytenr); + if (WARN_ON_ONCE(bg == NULL)) { + /* + * Unexpected and there's nothing we can do here + * because we are in a transaction abort path, + * so any errors can only be ignored or reported + * while attempting to cleanup all resources. + */ + btrfs_err(fs_info, +"block group for delayed ref at %llu was not found while destroying ref head", + head->bytenr); + } else { + spin_lock(&bg->space_info->lock); + spin_lock(&bg->lock); + bg->pinned += head->num_bytes; + btrfs_space_info_update_bytes_pinned(fs_info, + bg->space_info, + head->num_bytes); + bg->reserved -= head->num_bytes; + bg->space_info->bytes_reserved -= head->num_bytes; + spin_unlock(&bg->lock); + spin_unlock(&bg->space_info->lock); + + btrfs_put_block_group(bg); + } + + btrfs_error_unpin_extent_range(fs_info, head->bytenr, + head->bytenr + head->num_bytes - 1); + } + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); + btrfs_put_delayed_ref_head(head); + cond_resched(); + spin_lock(&delayed_refs->lock); + } + btrfs_qgroup_destroy_extent_records(trans); + + spin_unlock(&delayed_refs->lock); +} + void __cold btrfs_delayed_ref_exit(void) { kmem_cache_destroy(btrfs_delayed_ref_head_cachep); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 352921e76c74..611fb3388f82 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -61,7 +61,8 @@ struct btrfs_delayed_ref_node { /* * If action is BTRFS_ADD_DELAYED_REF, also link this node to * ref_head->ref_add_list, then we do not need to iterate the - * whole ref_head->ref_list to find BTRFS_ADD_DELAYED_REF nodes. + * refs rbtree in the corresponding delayed ref head + * (struct btrfs_delayed_ref_head::ref_tree). */ struct list_head add_list; @@ -123,12 +124,6 @@ struct btrfs_delayed_ref_head { u64 bytenr; u64 num_bytes; /* - * For insertion into struct btrfs_delayed_ref_root::href_root. - * Keep it in the same cache line as 'bytenr' for more efficient - * searches in the rbtree. - */ - struct rb_node href_node; - /* * the mutex is held while running the refs, and it is also * held when checking the sum of reference modifications. */ @@ -191,6 +186,11 @@ struct btrfs_delayed_ref_head { bool is_data; bool is_system; bool processing; + /* + * Indicate if it's currently in the data structure that tracks head + * refs (struct btrfs_delayed_ref_root::head_refs). + */ + bool tracked; }; enum btrfs_delayed_ref_flags { @@ -199,38 +199,52 @@ enum btrfs_delayed_ref_flags { }; struct btrfs_delayed_ref_root { - /* head ref rbtree */ - struct rb_root_cached href_root; - /* - * Track dirty extent records. + * Track head references. * The keys correspond to the logical address of the extent ("bytenr") * right shifted by fs_info->sectorsize_bits. This is both to get a more * dense index space (optimizes xarray structure) and because indexes in * xarrays are of "unsigned long" type, meaning they are 32 bits wide on * 32 bits platforms, limiting the extent range to 4G which is too low * and makes it unusable (truncated index values) on 32 bits platforms. + * Protected by the spinlock 'lock' defined below. */ - struct xarray dirty_extents; + struct xarray head_refs; - /* this spin lock protects the rbtree and the entries inside */ - spinlock_t lock; + /* + * Track dirty extent records. + * The keys correspond to the logical address of the extent ("bytenr") + * right shifted by fs_info->sectorsize_bits, for same reasons as above. + */ + struct xarray dirty_extents; - /* how many delayed ref updates we've queued, used by the - * throttling code + /* + * Protects the xarray head_refs, its entries and the following fields: + * num_heads, num_heads_ready, pending_csums and run_delayed_start. */ - atomic_t num_entries; + spinlock_t lock; - /* total number of head nodes in tree */ + /* Total number of head refs, protected by the spinlock 'lock'. */ unsigned long num_heads; - /* total number of head nodes ready for processing */ + /* + * Total number of head refs ready for processing, protected by the + * spinlock 'lock'. + */ unsigned long num_heads_ready; + /* + * Track space reserved for deleting csums of data extents. + * Protected by the spinlock 'lock'. + */ u64 pending_csums; unsigned long flags; + /* + * Track from which bytenr to start searching ref heads. + * Protected by the spinlock 'lock'. + */ u64 run_delayed_start; /* @@ -372,19 +386,22 @@ void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head); struct btrfs_delayed_ref_head * -btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, +btrfs_find_delayed_ref_head(const struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr); -int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head); static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) { mutex_unlock(&head->mutex); } -void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, +void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); struct btrfs_delayed_ref_head *btrfs_select_ref_head( + const struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs); +void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); @@ -399,6 +416,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, u64 root, u64 parent); +void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans); static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node) { diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 83d5cdd77f29..ac8e97ed13f7 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -45,7 +45,7 @@ * * - Copy existing extents * - * This happens by re-using scrub facility, as scrub also iterates through + * This happens by reusing scrub facility, as scrub also iterates through * existing extents from commit root. * * Location: scrub_write_block_to_dev_replace() from @@ -641,6 +641,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return ret; down_write(&dev_replace->rwsem); + dev_replace->replace_task = current; switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: @@ -994,6 +995,7 @@ error: list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list); fs_devices->rw_devices++; + dev_replace->replace_task = NULL; up_write(&dev_replace->rwsem); btrfs_rm_dev_replace_blocked(fs_info); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 1e8cd7c9472e..1ea5d8fcfbf7 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -27,7 +27,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle const char *name, int name_len) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret; char *ptr; struct extent_buffer *leaf; @@ -35,7 +34,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); if (ret == -EEXIST) { struct btrfs_dir_item *di; - di = btrfs_match_dir_item_name(fs_info, path, name, name_len); + di = btrfs_match_dir_item_name(path, name, name_len); if (di) return ERR_PTR(-EEXIST); btrfs_extend_item(trans, path, data_size); @@ -190,7 +189,7 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir( if (ret > 0) return ERR_PTR(-ENOENT); - return btrfs_match_dir_item_name(root->fs_info, path, name, name_len); + return btrfs_match_dir_item_name(path, name, name_len); } /* @@ -341,8 +340,7 @@ btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) break; - di = btrfs_match_dir_item_name(root->fs_info, path, - name->name, name->len); + di = btrfs_match_dir_item_name(path, name->name, name->len); if (di) return di; } @@ -378,8 +376,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, * this walks through all the entries in a dir item and finds one * for a specific name. */ -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, - const struct btrfs_path *path, +struct btrfs_dir_item *btrfs_match_dir_item_name(const struct btrfs_path *path, const char *name, int name_len) { struct btrfs_dir_item *dir_item; diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h index 5f6dfafc91f1..28d69970bc70 100644 --- a/fs/btrfs/dir-item.h +++ b/fs/btrfs/dir-item.h @@ -44,8 +44,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 dir, const char *name, u16 name_len, int mod); -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, - const struct btrfs_path *path, +struct btrfs_dir_item *btrfs_match_dir_item_name(const struct btrfs_path *path, const char *name, int name_len); diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index bd38df5647e3..a7c3e221378d 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -834,7 +834,7 @@ relock: return ret; } - ret = btrfs_write_check(iocb, from, ret); + ret = btrfs_write_check(iocb, ret); if (ret < 0) { btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); goto out; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b11bfe68dd65..814320948645 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -917,8 +917,7 @@ fail: return ERR_PTR(ret); } -static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +static struct btrfs_root *alloc_log_tree(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; @@ -966,7 +965,7 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, { struct btrfs_root *log_root; - log_root = alloc_log_tree(trans, fs_info); + log_root = alloc_log_tree(fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); @@ -992,7 +991,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_inode_item *inode_item; int ret; - log_root = alloc_log_tree(trans, fs_info); + log_root = alloc_log_tree(fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); @@ -2786,6 +2785,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) btrfs_init_scrub(fs_info); btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(fs_info); + btrfs_init_extent_map_shrinker_work(fs_info); rwlock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT_CACHED; @@ -2852,8 +2852,6 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block if (ret) return ret; - spin_lock_init(&fs_info->extent_map_shrinker_lock); - ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) return ret; @@ -3202,8 +3200,7 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount) return 0; } -int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, - const char *options) +int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices) { u32 sectorsize; u32 nodesize; @@ -4186,7 +4183,7 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) btrfs_warn(fs_info, "transaction %llu (with %llu dirty metadata bytes) is not committed", trans->transid, dirty_bytes); - btrfs_cleanup_one_transaction(trans, fs_info); + btrfs_cleanup_one_transaction(trans); if (trans == fs_info->running_transaction) fs_info->running_transaction = NULL; @@ -4294,6 +4291,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); + cancel_work_sync(&fs_info->em_shrinker_work); /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); @@ -4531,75 +4529,6 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL); } -static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, - struct btrfs_fs_info *fs_info) -{ - struct rb_node *node; - struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; - struct btrfs_delayed_ref_node *ref; - - spin_lock(&delayed_refs->lock); - while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) { - struct btrfs_delayed_ref_head *head; - struct rb_node *n; - bool pin_bytes = false; - - head = rb_entry(node, struct btrfs_delayed_ref_head, - href_node); - if (btrfs_delayed_ref_lock(delayed_refs, head)) - continue; - - spin_lock(&head->lock); - while ((n = rb_first_cached(&head->ref_tree)) != NULL) { - ref = rb_entry(n, struct btrfs_delayed_ref_node, - ref_node); - rb_erase_cached(&ref->ref_node, &head->ref_tree); - RB_CLEAR_NODE(&ref->ref_node); - if (!list_empty(&ref->add_list)) - list_del(&ref->add_list); - atomic_dec(&delayed_refs->num_entries); - btrfs_put_delayed_ref(ref); - btrfs_delayed_refs_rsv_release(fs_info, 1, 0); - } - if (head->must_insert_reserved) - pin_bytes = true; - btrfs_free_delayed_extent_op(head->extent_op); - btrfs_delete_ref_head(delayed_refs, head); - spin_unlock(&head->lock); - spin_unlock(&delayed_refs->lock); - mutex_unlock(&head->mutex); - - if (pin_bytes) { - struct btrfs_block_group *cache; - - cache = btrfs_lookup_block_group(fs_info, head->bytenr); - BUG_ON(!cache); - - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - cache->pinned += head->num_bytes; - btrfs_space_info_update_bytes_pinned(fs_info, - cache->space_info, head->num_bytes); - cache->reserved -= head->num_bytes; - cache->space_info->bytes_reserved -= head->num_bytes; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - - btrfs_put_block_group(cache); - - btrfs_error_unpin_extent_range(fs_info, head->bytenr, - head->bytenr + head->num_bytes - 1); - } - btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); - btrfs_put_delayed_ref_head(head); - cond_resched(); - spin_lock(&delayed_refs->lock); - } - btrfs_qgroup_destroy_extent_records(trans); - - spin_unlock(&delayed_refs->lock); -} - static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) { struct btrfs_inode *btrfs_inode; @@ -4805,9 +4734,9 @@ static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->fs_roots_radix_lock); } -void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, - struct btrfs_fs_info *fs_info) +void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans) { + struct btrfs_fs_info *fs_info = cur_trans->fs_info; struct btrfs_device *dev, *tmp; btrfs_cleanup_dirty_bgs(cur_trans, fs_info); @@ -4819,7 +4748,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, list_del_init(&dev->post_commit_list); } - btrfs_destroy_delayed_refs(cur_trans, fs_info); + btrfs_destroy_delayed_refs(cur_trans); cur_trans->state = TRANS_STATE_COMMIT_START; wake_up(&fs_info->transaction_blocked_wait); @@ -4865,7 +4794,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) } else { spin_unlock(&fs_info->trans_lock); } - btrfs_cleanup_one_transaction(t, fs_info); + btrfs_cleanup_one_transaction(t); spin_lock(&fs_info->trans_lock); if (t == fs_info->running_transaction) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 99af64d3f277..a7051e2570c1 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -52,8 +52,7 @@ struct extent_buffer *btrfs_find_create_tree_block( int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, const struct btrfs_super_block *disk_sb); -int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, - const char *options); +int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices); void __cold close_ctree(struct btrfs_fs_info *fs_info); int btrfs_validate_super(const struct btrfs_fs_info *fs_info, const struct btrfs_super_block *sb, int mirror_num); @@ -127,8 +126,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); -void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, - struct btrfs_fs_info *fs_info); +void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans); struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d9f511babd89..412e318e4a22 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -182,7 +182,7 @@ search_again: delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { refcount_inc(&head->refs); @@ -795,7 +795,6 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, if (insert) { extra_size = btrfs_extent_inline_ref_size(want); path->search_for_extension = 1; - path->keep_locks = 1; } else extra_size = -1; @@ -946,6 +945,25 @@ again: ret = -EAGAIN; goto out; } + + if (path->slots[0] + 1 < btrfs_header_nritems(path->nodes[0])) { + struct btrfs_key tmp_key; + + btrfs_item_key_to_cpu(path->nodes[0], &tmp_key, path->slots[0] + 1); + if (tmp_key.objectid == bytenr && + tmp_key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { + ret = -EAGAIN; + goto out; + } + goto out_no_entry; + } + + if (!path->keep_locks) { + btrfs_release_path(path); + path->keep_locks = 1; + goto again; + } + /* * To add new inline back ref, we have to make sure * there is no corresponding back ref item. @@ -959,13 +977,15 @@ again: goto out; } } +out_no_entry: *ref_ret = (struct btrfs_extent_inline_ref *)ptr; out: - if (insert) { + if (path->keep_locks) { path->keep_locks = 0; - path->search_for_extension = 0; btrfs_unlock_up_safe(path, 1); } + if (insert) + path->search_for_extension = 0; return ret; } @@ -1807,16 +1827,6 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head) return ref; } -static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head) -{ - spin_lock(&delayed_refs->lock); - head->processing = false; - delayed_refs->num_heads_ready++; - spin_unlock(&delayed_refs->lock); - btrfs_delayed_ref_unlock(head); -} - static struct btrfs_delayed_extent_op *cleanup_extent_op( struct btrfs_delayed_ref_head *head) { @@ -1891,7 +1901,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, ret = run_and_cleanup_extent_op(trans, head); if (ret < 0) { - unselect_delayed_ref_head(delayed_refs, head); + btrfs_unselect_ref_head(delayed_refs, head); btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); return ret; } else if (ret) { @@ -1910,7 +1920,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); return 1; } - btrfs_delete_ref_head(delayed_refs, head); + btrfs_delete_ref_head(fs_info, delayed_refs, head); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); @@ -1933,39 +1943,6 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, return ret; } -static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( - struct btrfs_trans_handle *trans) -{ - struct btrfs_delayed_ref_root *delayed_refs = - &trans->transaction->delayed_refs; - struct btrfs_delayed_ref_head *head = NULL; - int ret; - - spin_lock(&delayed_refs->lock); - head = btrfs_select_ref_head(delayed_refs); - if (!head) { - spin_unlock(&delayed_refs->lock); - return head; - } - - /* - * Grab the lock that says we are going to process all the refs for - * this head - */ - ret = btrfs_delayed_ref_lock(delayed_refs, head); - spin_unlock(&delayed_refs->lock); - - /* - * We may have dropped the spin lock to get the head mutex lock, and - * that might have given someone else time to free the head. If that's - * true, it has been removed from our list and we can move on. - */ - if (ret == -EAGAIN) - head = ERR_PTR(-EAGAIN); - - return head; -} - static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *locked_ref, u64 *bytes_released) @@ -1986,7 +1963,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, if (ref->seq && btrfs_check_delayed_seq(fs_info, ref->seq)) { spin_unlock(&locked_ref->lock); - unselect_delayed_ref_head(delayed_refs, locked_ref); + btrfs_unselect_ref_head(delayed_refs, locked_ref); return -EAGAIN; } @@ -2009,7 +1986,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, default: WARN_ON(1); } - atomic_dec(&delayed_refs->num_entries); /* * Record the must_insert_reserved flag before we drop the @@ -2035,7 +2011,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, btrfs_free_delayed_extent_op(extent_op); if (ret) { - unselect_delayed_ref_head(delayed_refs, locked_ref); + btrfs_unselect_ref_head(delayed_refs, locked_ref); btrfs_put_delayed_ref(ref); return ret; } @@ -2073,7 +2049,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, do { if (!locked_ref) { - locked_ref = btrfs_obtain_ref_head(trans); + locked_ref = btrfs_select_ref_head(fs_info, delayed_refs); if (IS_ERR_OR_NULL(locked_ref)) { if (PTR_ERR(locked_ref) == -EAGAIN) { continue; @@ -2220,7 +2196,7 @@ again: btrfs_create_pending_block_groups(trans); spin_lock(&delayed_refs->lock); - if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) { + if (xa_empty(&delayed_refs->head_refs)) { spin_unlock(&delayed_refs->lock); return 0; } @@ -2275,7 +2251,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, delayed_refs = &cur_trans->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr); if (!head) { spin_unlock(&delayed_refs->lock); btrfs_put_transaction(cur_trans); @@ -3144,7 +3120,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, break; } - /* Quick path didn't find the EXTEMT/METADATA_ITEM */ + /* Quick path didn't find the EXTENT/METADATA_ITEM */ if (path->slots[0] - extent_slot > 5) break; extent_slot--; @@ -3377,13 +3353,14 @@ out: static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, u64 bytenr) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; int ret = 0; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr); if (!head) goto out_delayed_unlock; @@ -3401,7 +3378,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (!mutex_trylock(&head->mutex)) goto out; - btrfs_delete_ref_head(delayed_refs, head); + btrfs_delete_ref_head(fs_info, delayed_refs, head); head->processing = false; spin_unlock(&head->lock); @@ -3411,7 +3388,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (head->must_insert_reserved) ret = 1; - btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head); + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); return ret; @@ -5270,7 +5247,7 @@ struct walk_control { * corrupted file systems must have been caught before calling this function. */ static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc, - struct extent_buffer *eb, u64 refs, u64 flags, int slot) + struct extent_buffer *eb, u64 flags, int slot) { struct btrfs_key key; u64 generation; @@ -5384,7 +5361,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, continue; /* If we don't need to visit this node don't reada. */ - if (!visit_node_for_delete(root, wc, eb, refs, flags, slot)) + if (!visit_node_for_delete(root, wc, eb, flags, slot)) continue; reada: btrfs_readahead_node_child(eb, slot); @@ -5518,7 +5495,7 @@ again: */ delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr); if (!head) goto out; if (!mutex_trylock(&head->mutex)) { @@ -5737,8 +5714,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, /* If we don't have to walk into this node skip it. */ if (!visit_node_for_delete(root, wc, path->nodes[level], - wc->refs[level - 1], wc->flags[level - 1], - path->slots[level])) + wc->flags[level - 1], path->slots[level])) goto skip; /* diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 42c9899d9241..b923d0cec61c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -190,7 +190,7 @@ static void process_one_folio(struct btrfs_fs_info *fs_info, btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len); if (folio != locked_folio && (page_ops & PAGE_UNLOCK)) - btrfs_folio_end_writer_lock(fs_info, folio, start, len); + btrfs_folio_end_lock(fs_info, folio, start, len); } static void __process_folios_contig(struct address_space *mapping, @@ -276,7 +276,7 @@ static noinline int lock_delalloc_folios(struct inode *inode, range_start = max_t(u64, folio_pos(folio), start); range_len = min_t(u64, folio_pos(folio) + folio_size(folio), end + 1) - range_start; - btrfs_folio_set_writer_lock(fs_info, folio, range_start, range_len); + btrfs_folio_set_lock(fs_info, folio, range_start, range_len); processed_end = range_start + range_len - 1; } @@ -438,7 +438,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le if (!btrfs_is_subpage(fs_info, folio->mapping)) folio_unlock(folio); else - btrfs_subpage_end_reader(fs_info, folio, start, len); + btrfs_folio_end_lock(fs_info, folio, start, len); } /* @@ -495,7 +495,7 @@ static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) return; ASSERT(folio_test_private(folio)); - btrfs_subpage_start_reader(fs_info, folio, folio_pos(folio), PAGE_SIZE); + btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE); } /* @@ -1102,6 +1102,45 @@ int btrfs_read_folio(struct file *file, struct folio *folio) return ret; } +static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bitmap, + u64 start, u32 len) +{ + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + const u64 folio_start = folio_pos(folio); + unsigned int start_bit; + unsigned int nbits; + + ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE); + start_bit = (start - folio_start) >> fs_info->sectorsize_bits; + nbits = len >> fs_info->sectorsize_bits; + ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits)); + bitmap_set(delalloc_bitmap, start_bit, nbits); +} + +static bool find_next_delalloc_bitmap(struct folio *folio, + unsigned long *delalloc_bitmap, u64 start, + u64 *found_start, u32 *found_len) +{ + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + const u64 folio_start = folio_pos(folio); + const unsigned int bitmap_size = fs_info->sectors_per_page; + unsigned int start_bit; + unsigned int first_zero; + unsigned int first_set; + + ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE); + + start_bit = (start - folio_start) >> fs_info->sectorsize_bits; + first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit); + if (first_set >= bitmap_size) + return false; + + *found_start = folio_start + (first_set << fs_info->sectorsize_bits); + first_zero = find_next_zero_bit(delalloc_bitmap, bitmap_size, first_set); + *found_len = (first_zero - first_set) << fs_info->sectorsize_bits; + return true; +} + /* * helper for extent_writepage(), doing all of the delayed allocation setup. * @@ -1121,6 +1160,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping); const u64 page_start = folio_pos(folio); const u64 page_end = page_start + folio_size(folio) - 1; + unsigned long delalloc_bitmap = 0; /* * Save the last found delalloc end. As the delalloc end can go beyond * page boundary, thus we cannot rely on subpage bitmap to locate the @@ -1131,6 +1171,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, u64 delalloc_end = page_end; u64 delalloc_to_write = 0; int ret = 0; + int bit; /* Save the dirty bitmap as our submission bitmap will be a subset of it. */ if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { @@ -1140,6 +1181,12 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, bio_ctrl->submit_bitmap = 1; } + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { + u64 start = page_start + (bit << fs_info->sectorsize_bits); + + btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize); + } + /* Lock all (subpage) delalloc ranges inside the folio first. */ while (delalloc_start < page_end) { delalloc_end = page_end; @@ -1148,9 +1195,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, delalloc_start = delalloc_end + 1; continue; } - btrfs_folio_set_writer_lock(fs_info, folio, delalloc_start, - min(delalloc_end, page_end) + 1 - - delalloc_start); + set_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start, + min(delalloc_end, page_end) + 1 - delalloc_start); last_delalloc_end = delalloc_end; delalloc_start = delalloc_end + 1; } @@ -1175,7 +1221,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, found_len = last_delalloc_end + 1 - found_start; found = true; } else { - found = btrfs_subpage_find_writer_locked(fs_info, folio, + found = find_next_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start, &found_start, &found_len); } if (!found) @@ -1314,7 +1360,7 @@ static int submit_one_sector(struct btrfs_inode *inode, * a folio for a range already written to disk. */ btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); - btrfs_set_range_writeback(inode, filepos, filepos + sectorsize - 1); + btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); /* * Above call should set the whole folio with writeback flag, even * just for a single subpage sector. @@ -1391,8 +1437,6 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, goto out; submitted_io = true; } - - btrfs_folio_assert_not_dirty(fs_info, folio, start, len); out: /* * If we didn't submitted any sector (>= i_size), folio dirty get @@ -1476,7 +1520,7 @@ done: * Only unlock ranges that are submitted. As there can be some async * submitted ranges inside the folio. */ - btrfs_folio_end_writer_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap); + btrfs_folio_end_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap); ASSERT(ret <= 0); return ret; } @@ -2115,7 +2159,27 @@ retry: continue; } - if (wbc->sync_mode != WB_SYNC_NONE) { + /* + * For subpage case, compression can lead to mixed + * writeback and dirty flags, e.g: + * 0 32K 64K 96K 128K + * | |//////||/////| |//| + * + * In above case, [32K, 96K) is asynchronously submitted + * for compression, and [124K, 128K) needs to be written back. + * + * If we didn't wait wrtiteback for page 64K, [128K, 128K) + * won't be submitted as the page still has writeback flag + * and will be skipped in the next check. + * + * This mixed writeback and dirty case is only possible for + * subpage case. + * + * TODO: Remove this check after migrating compression to + * regular submission. + */ + if (wbc->sync_mode != WB_SYNC_NONE || + btrfs_is_subpage(inode_to_fs_info(inode), mapping)) { if (folio_test_writeback(folio)) submit_write_bio(bio_ctrl, 0); folio_wait_writeback(folio); @@ -2200,7 +2264,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f u32 cur_len = cur_end + 1 - cur; struct folio *folio; - folio = __filemap_get_folio(mapping, cur >> PAGE_SHIFT, 0, 0); + folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT); /* * This shouldn't happen, the pages are pinned and locked, this @@ -2233,7 +2297,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f cur, cur_len, !ret); mapping_set_error(mapping, ret); } - btrfs_folio_end_writer_lock(fs_info, folio, cur, cur_len); + btrfs_folio_end_lock(fs_info, folio, cur, cur_len); if (ret < 0) found_error = true; next_page: @@ -2317,7 +2381,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree, * to drop the page. */ static bool try_release_extent_state(struct extent_io_tree *tree, - struct folio *folio, gfp_t mask) + struct folio *folio) { u64 start = folio_pos(folio); u64 end = start + PAGE_SIZE - 1; @@ -2428,7 +2492,7 @@ next: cond_resched(); } } - return try_release_extent_state(io_tree, folio, mask); + return try_release_extent_state(io_tree, folio); } static void __free_extent_buffer(struct extent_buffer *eb) @@ -2442,7 +2506,7 @@ static int extent_buffer_under_io(const struct extent_buffer *eb) test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); } -static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *folio) +static bool folio_range_has_eb(struct folio *folio) { struct btrfs_subpage *subpage; @@ -2452,12 +2516,6 @@ static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *foli subpage = folio_get_private(folio); if (atomic_read(&subpage->eb_refs)) return true; - /* - * Even there is no eb refs here, we may still have - * end_folio_read() call relying on page::private. - */ - if (atomic_read(&subpage->readers)) - return true; } return false; } @@ -2516,7 +2574,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * We can only detach the folio private if there are no other ebs in the * page range and no unfinished IO. */ - if (!folio_range_has_eb(fs_info, folio)) + if (!folio_range_has_eb(folio)) btrfs_detach_subpage(fs_info, folio); spin_unlock(&folio->mapping->i_private_lock); @@ -3121,7 +3179,7 @@ out: } /* * Now all pages of that extent buffer is unmapped, set UNMAPPED flag, - * so it can be cleaned up without utlizing page->mapping. + * so it can be cleaned up without utilizing page->mapping. */ set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); @@ -4221,7 +4279,6 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, u64 gen, int level) { struct btrfs_tree_parent_check check = { - .has_first_key = 0, .level = level, .transid = gen }; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 1d93e1202c33..67ce85ff0ae2 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -77,10 +77,13 @@ static u64 range_end(u64 start, u64 len) return start + len; } -static void dec_evictable_extent_maps(struct btrfs_inode *inode) +static void remove_em(struct btrfs_inode *inode, struct extent_map *em) { struct btrfs_fs_info *fs_info = inode->root->fs_info; + rb_erase(&em->rb_node, &inode->extent_tree.root); + RB_CLEAR_NODE(&em->rb_node); + if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root))) percpu_counter_dec(&fs_info->evictable_extent_maps); } @@ -339,7 +342,6 @@ static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_map_tree *tree = &inode->extent_tree; struct extent_map *merge = NULL; struct rb_node *rb; @@ -371,10 +373,8 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) em->flags |= EXTENT_FLAG_MERGED; validate_extent_map(fs_info, em); - rb_erase(&merge->rb_node, &tree->root); - RB_CLEAR_NODE(&merge->rb_node); + remove_em(inode, merge); free_extent_map(merge); - dec_evictable_extent_maps(inode); } } @@ -386,12 +386,10 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) merge_ondisk_extents(em, merge, em); validate_extent_map(fs_info, em); - rb_erase(&merge->rb_node, &tree->root); - RB_CLEAR_NODE(&merge->rb_node); em->generation = max(em->generation, merge->generation); em->flags |= EXTENT_FLAG_MERGED; + remove_em(inode, merge); free_extent_map(merge); - dec_evictable_extent_maps(inode); } } @@ -588,12 +586,10 @@ void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em) lockdep_assert_held_write(&tree->lock); WARN_ON(em->flags & EXTENT_FLAG_PINNED); - rb_erase(&em->rb_node, &tree->root); if (!(em->flags & EXTENT_FLAG_LOGGING)) list_del_init(&em->list); - RB_CLEAR_NODE(&em->rb_node); - dec_evictable_extent_maps(inode); + remove_em(inode, em); } static void replace_extent_mapping(struct btrfs_inode *inode, @@ -1122,13 +1118,12 @@ out_free_pre: struct btrfs_em_shrink_ctx { long nr_to_scan; long scanned; - u64 last_ino; - u64 last_root; }; static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx) { - const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info); struct extent_map_tree *tree = &inode->extent_tree; long nr_dropped = 0; struct rb_node *node; @@ -1201,7 +1196,8 @@ next: * lock. This is to avoid slowing other tasks trying to take the * lock. */ - if (need_resched() || rwlock_needbreak(&tree->lock)) + if (need_resched() || rwlock_needbreak(&tree->lock) || + btrfs_fs_closing(fs_info)) break; node = next; } @@ -1213,19 +1209,21 @@ next: static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_inode *inode; long nr_dropped = 0; - u64 min_ino = ctx->last_ino + 1; + u64 min_ino = fs_info->em_shrinker_last_ino + 1; inode = btrfs_find_first_inode(root, min_ino); while (inode) { nr_dropped += btrfs_scan_inode(inode, ctx); min_ino = btrfs_ino(inode) + 1; - ctx->last_ino = btrfs_ino(inode); + fs_info->em_shrinker_last_ino = btrfs_ino(inode); btrfs_add_delayed_iput(inode); - if (ctx->scanned >= ctx->nr_to_scan) + if (ctx->scanned >= ctx->nr_to_scan || + btrfs_fs_closing(inode->root->fs_info)) break; cond_resched(); @@ -1241,52 +1239,43 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx * inode if there is one or we will find out this was the last * one and move to the next root. */ - ctx->last_root = btrfs_root_id(root); + fs_info->em_shrinker_last_root = btrfs_root_id(root); } else { /* * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so * that when processing the next root we start from its first inode. */ - ctx->last_ino = 0; - ctx->last_root = btrfs_root_id(root) + 1; + fs_info->em_shrinker_last_ino = 0; + fs_info->em_shrinker_last_root = btrfs_root_id(root) + 1; } return nr_dropped; } -long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) +static void btrfs_extent_map_shrinker_worker(struct work_struct *work) { + struct btrfs_fs_info *fs_info; struct btrfs_em_shrink_ctx ctx; u64 start_root_id; u64 next_root_id; bool cycled = false; long nr_dropped = 0; - ctx.scanned = 0; - ctx.nr_to_scan = nr_to_scan; + fs_info = container_of(work, struct btrfs_fs_info, em_shrinker_work); - /* - * In case we have multiple tasks running this shrinker, make the next - * one start from the next inode in case it starts before we finish. - */ - spin_lock(&fs_info->extent_map_shrinker_lock); - ctx.last_ino = fs_info->extent_map_shrinker_last_ino; - fs_info->extent_map_shrinker_last_ino++; - ctx.last_root = fs_info->extent_map_shrinker_last_root; - spin_unlock(&fs_info->extent_map_shrinker_lock); + ctx.scanned = 0; + ctx.nr_to_scan = atomic64_read(&fs_info->em_shrinker_nr_to_scan); - start_root_id = ctx.last_root; - next_root_id = ctx.last_root; + start_root_id = fs_info->em_shrinker_last_root; + next_root_id = fs_info->em_shrinker_last_root; if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) { s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); - trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, - nr, ctx.last_root, - ctx.last_ino); + trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr); } - while (ctx.scanned < ctx.nr_to_scan) { + while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) { struct btrfs_root *root; unsigned long count; @@ -1300,8 +1289,8 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) spin_unlock(&fs_info->fs_roots_radix_lock); if (start_root_id > 0 && !cycled) { next_root_id = 0; - ctx.last_root = 0; - ctx.last_ino = 0; + fs_info->em_shrinker_last_root = 0; + fs_info->em_shrinker_last_ino = 0; cycled = true; continue; } @@ -1320,29 +1309,40 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) btrfs_put_root(root); } - /* - * In case of multiple tasks running this extent map shrinking code this - * isn't perfect but it's simple and silences things like KCSAN. It's - * not possible to know which task made more progress because we can - * cycle back to the first root and first inode if it's not the first - * time the shrinker ran, see the above logic. Also a task that started - * later may finish ealier than another task and made less progress. So - * make this simple and update to the progress of the last task that - * finished, with the occasional possiblity of having two consecutive - * runs of the shrinker process the same inodes. - */ - spin_lock(&fs_info->extent_map_shrinker_lock); - fs_info->extent_map_shrinker_last_ino = ctx.last_ino; - fs_info->extent_map_shrinker_last_root = ctx.last_root; - spin_unlock(&fs_info->extent_map_shrinker_lock); - if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) { s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); - trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, - nr, ctx.last_root, - ctx.last_ino); + trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr); } - return nr_dropped; + atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0); +} + +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) +{ + /* + * Do nothing if the shrinker is already running. In case of high memory + * pressure we can have a lot of tasks calling us and all passing the + * same nr_to_scan value, but in reality we may need only to free + * nr_to_scan extent maps (or less). In case we need to free more than + * that, we will be called again by the fs shrinker, so no worries about + * not doing enough work to reclaim memory from extent maps. + * We can also be repeatedly called with the same nr_to_scan value + * simply because the shrinker runs asynchronously and multiple calls + * to this function are made before the shrinker does enough progress. + * + * That's why we set the atomic counter to nr_to_scan only if its + * current value is zero, instead of incrementing the counter by + * nr_to_scan. + */ + if (atomic64_cmpxchg(&fs_info->em_shrinker_nr_to_scan, 0, nr_to_scan) != 0) + return; + + queue_work(system_unbound_wq, &fs_info->em_shrinker_work); +} + +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info) +{ + atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0); + INIT_WORK(&fs_info->em_shrinker_work, btrfs_extent_map_shrinker_worker); } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 5154a8f1d26c..cd123b266b64 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -189,6 +189,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, int btrfs_replace_extent_map_range(struct btrfs_inode *inode, struct extent_map *new_em, bool modified); -long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan); +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan); +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c index df7f09f3b02e..b80c07ad8c5e 100644 --- a/fs/btrfs/fiemap.c +++ b/fs/btrfs/fiemap.c @@ -186,7 +186,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, * we have in the cache is the last delalloc range we * found while the file extent item we found can be * either for a whole delalloc range we previously - * emmitted or only a part of that range. + * emitted or only a part of that range. * * We have two cases here: * @@ -194,13 +194,13 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, * cached extent's end. In this case just ignore the * current file extent item because we don't want to * overlap with previous ranges that may have been - * emmitted already; + * emitted already; * * 2) The file extent item starts behind the currently * cached extent but its end offset goes beyond the * end offset of the cached extent. We don't want to * overlap with a previous range that may have been - * emmitted already, so we emit the currently cached + * emitted already, so we emit the currently cached * extent and then partially store the current file * extent item's range in the cache, for the subrange * going the cached extent's end to the end of the diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e5384ceb8acf..588c353d2969 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -37,33 +37,30 @@ #include "file.h" #include "super.h" -/* simple helper to fault in pages and copy. This should go away - * and be replaced with calls into generic code. +/* + * Helper to fault in page and copy. This should go away and be replaced with + * calls into generic code. */ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, - struct page **prepared_pages, - struct iov_iter *i) + struct folio *folio, struct iov_iter *i) { size_t copied = 0; size_t total_copied = 0; - int pg = 0; int offset = offset_in_page(pos); while (write_bytes > 0) { - size_t count = min_t(size_t, - PAGE_SIZE - offset, write_bytes); - struct page *page = prepared_pages[pg]; + size_t count = min_t(size_t, PAGE_SIZE - offset, write_bytes); /* * Copy data from userspace to the current page */ - copied = copy_page_from_iter_atomic(page, offset, count, i); + copied = copy_folio_from_iter_atomic(folio, offset, count, i); /* Flush processor's dcache for this page */ - flush_dcache_page(page); + flush_dcache_folio(folio); /* * if we get a partial write, we can end up with - * partially up to date pages. These add + * partially up to date page. These add * a lot of complexity, so make sure they don't * happen by forcing this copy to be retried. * @@ -71,7 +68,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, * back to page at a time copies after we return 0. */ if (unlikely(copied < count)) { - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { iov_iter_revert(i, copied); copied = 0; } @@ -82,54 +79,44 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, write_bytes -= copied; total_copied += copied; offset += copied; - if (offset == PAGE_SIZE) { - pg++; - offset = 0; - } } return total_copied; } /* - * unlocks pages after btrfs_file_write is done with them + * Unlock folio after btrfs_file_write() is done with it. */ -static void btrfs_drop_pages(struct btrfs_fs_info *fs_info, - struct page **pages, size_t num_pages, +static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio, u64 pos, u64 copied) { - size_t i; u64 block_start = round_down(pos, fs_info->sectorsize); u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start; ASSERT(block_len <= U32_MAX); - for (i = 0; i < num_pages; i++) { - /* page checked is some magic around finding pages that - * have been modified without going through btrfs_set_page_dirty - * clear it here. There should be no need to mark the pages - * accessed as prepare_pages should have marked them accessed - * in prepare_pages via find_or_create_page() - */ - btrfs_folio_clamp_clear_checked(fs_info, page_folio(pages[i]), - block_start, block_len); - unlock_page(pages[i]); - put_page(pages[i]); - } + /* + * Folio checked is some magic around finding folios that have been + * modified without going through btrfs_dirty_folio(). Clear it here. + * There should be no need to mark the pages accessed as + * prepare_one_folio() should have marked them accessed in + * prepare_one_folio() via find_or_create_page() + */ + btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len); + folio_unlock(folio); + folio_put(folio); } /* * After btrfs_copy_from_user(), update the following things for delalloc: - * - Mark newly dirtied pages as DELALLOC in the io tree. + * - Mark newly dirtied folio as DELALLOC in the io tree. * Used to advise which range is to be written back. - * - Mark modified pages as Uptodate/Dirty and not needing COW fixup + * - Mark modified folio as Uptodate/Dirty and not needing COW fixup * - Update inode size for past EOF write */ -int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, - size_t num_pages, loff_t pos, size_t write_bytes, - struct extent_state **cached, bool noreserve) +int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos, + size_t write_bytes, struct extent_state **cached, bool noreserve) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; - int i; u64 num_bytes; u64 start_pos; u64 end_of_last_block; @@ -147,6 +134,8 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, num_bytes = round_up(write_bytes + pos - start_pos, fs_info->sectorsize); ASSERT(num_bytes <= U32_MAX); + ASSERT(folio_pos(folio) <= pos && + folio_pos(folio) + folio_size(folio) >= pos + write_bytes); end_of_last_block = start_pos + num_bytes - 1; @@ -163,16 +152,9 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, if (ret) return ret; - for (i = 0; i < num_pages; i++) { - struct page *p = pages[i]; - - btrfs_folio_clamp_set_uptodate(fs_info, page_folio(p), - start_pos, num_bytes); - btrfs_folio_clamp_clear_checked(fs_info, page_folio(p), - start_pos, num_bytes); - btrfs_folio_clamp_set_dirty(fs_info, page_folio(p), - start_pos, num_bytes); - } + btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes); + btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes); + btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes); /* * we've only changed i_size in ram, and we haven't updated @@ -851,53 +833,47 @@ out: } /* - * on error we return an unlocked page and the error value - * on success we return a locked page and 0 + * On error return an unlocked folio and the error value + * On success return a locked folio and 0 */ -static int prepare_uptodate_page(struct inode *inode, - struct page *page, u64 pos, - bool force_uptodate) +static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos, + u64 len, bool force_uptodate) { - struct folio *folio = page_folio(page); + u64 clamp_start = max_t(u64, pos, folio_pos(folio)); + u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio)); int ret = 0; - if (((pos & (PAGE_SIZE - 1)) || force_uptodate) && - !PageUptodate(page)) { - ret = btrfs_read_folio(NULL, folio); - if (ret) - return ret; - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - return -EIO; - } - - /* - * Since btrfs_read_folio() will unlock the folio before it - * returns, there is a window where btrfs_release_folio() can be - * called to release the page. Here we check both inode - * mapping and PagePrivate() to make sure the page was not - * released. - * - * The private flag check is essential for subpage as we need - * to store extra bitmap using folio private. - */ - if (page->mapping != inode->i_mapping || !folio_test_private(folio)) { - unlock_page(page); - return -EAGAIN; - } - } - return 0; -} + if (folio_test_uptodate(folio)) + return 0; -static fgf_t get_prepare_fgp_flags(bool nowait) -{ - fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT; + if (!force_uptodate && + IS_ALIGNED(clamp_start, PAGE_SIZE) && + IS_ALIGNED(clamp_end, PAGE_SIZE)) + return 0; - if (nowait) - fgp_flags |= FGP_NOWAIT; + ret = btrfs_read_folio(NULL, folio); + if (ret) + return ret; + folio_lock(folio); + if (!folio_test_uptodate(folio)) { + folio_unlock(folio); + return -EIO; + } - return fgp_flags; + /* + * Since btrfs_read_folio() will unlock the folio before it returns, + * there is a window where btrfs_release_folio() can be called to + * release the page. Here we check both inode mapping and page + * private to make sure the page was not released. + * + * The private flag check is essential for subpage as we need to store + * extra bitmap using folio private. + */ + if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) { + folio_unlock(folio); + return -EAGAIN; + } + return 0; } static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait) @@ -914,89 +890,67 @@ static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait) } /* - * this just gets pages into the page cache and locks them down. + * Get folio into the page cache and lock it. */ -static noinline int prepare_pages(struct inode *inode, struct page **pages, - size_t num_pages, loff_t pos, - size_t write_bytes, bool force_uptodate, - bool nowait) +static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret, + loff_t pos, size_t write_bytes, + bool force_uptodate, bool nowait) { - int i; unsigned long index = pos >> PAGE_SHIFT; gfp_t mask = get_prepare_gfp_flags(inode, nowait); - fgf_t fgp_flags = get_prepare_fgp_flags(nowait); + fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN); + struct folio *folio; int ret = 0; - int faili; - for (i = 0; i < num_pages; i++) { again: - pages[i] = pagecache_get_page(inode->i_mapping, index + i, - fgp_flags, mask | __GFP_WRITE); - if (!pages[i]) { - faili = i - 1; - if (nowait) - ret = -EAGAIN; - else - ret = -ENOMEM; - goto fail; - } - - ret = set_page_extent_mapped(pages[i]); - if (ret < 0) { - faili = i; - goto fail; - } - - if (i == 0) - ret = prepare_uptodate_page(inode, pages[i], pos, - force_uptodate); - if (!ret && i == num_pages - 1) - ret = prepare_uptodate_page(inode, pages[i], - pos + write_bytes, false); - if (ret) { - put_page(pages[i]); - if (!nowait && ret == -EAGAIN) { - ret = 0; - goto again; - } - faili = i - 1; - goto fail; + folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask); + if (IS_ERR(folio)) { + if (nowait) + ret = -EAGAIN; + else + ret = PTR_ERR(folio); + return ret; + } + /* Only support page sized folio yet. */ + ASSERT(folio_order(folio) == 0); + ret = set_folio_extent_mapped(folio); + if (ret < 0) { + folio_unlock(folio); + folio_put(folio); + return ret; + } + ret = prepare_uptodate_folio(inode, folio, pos, write_bytes, force_uptodate); + if (ret) { + /* The folio is already unlocked. */ + folio_put(folio); + if (!nowait && ret == -EAGAIN) { + ret = 0; + goto again; } - wait_on_page_writeback(pages[i]); + return ret; } - + *folio_ret = folio; return 0; -fail: - while (faili >= 0) { - unlock_page(pages[faili]); - put_page(pages[faili]); - faili--; - } - return ret; - } /* - * This function locks the extent and properly waits for data=ordered extents - * to finish before allowing the pages to be modified if need. + * Locks the extent and properly waits for data=ordered extents to finish + * before allowing the folios to be modified if need. * - * The return value: + * Return: * 1 - the extent is locked * 0 - the extent is not locked, and everything is OK - * -EAGAIN - need re-prepare the pages - * the other < 0 number - Something wrong happens + * -EAGAIN - need to prepare the folios again */ static noinline int -lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, - size_t num_pages, loff_t pos, - size_t write_bytes, +lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, + loff_t pos, size_t write_bytes, u64 *lockstart, u64 *lockend, bool nowait, struct extent_state **cached_state) { struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 start_pos; u64 last_pos; - int i; int ret = 0; start_pos = round_down(pos, fs_info->sectorsize); @@ -1008,12 +962,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, if (nowait) { if (!try_lock_extent(&inode->io_tree, start_pos, last_pos, cached_state)) { - for (i = 0; i < num_pages; i++) { - unlock_page(pages[i]); - put_page(pages[i]); - pages[i] = NULL; - } - + folio_unlock(folio); + folio_put(folio); return -EAGAIN; } } else { @@ -1027,10 +977,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, ordered->file_offset <= last_pos) { unlock_extent(&inode->io_tree, start_pos, last_pos, cached_state); - for (i = 0; i < num_pages; i++) { - unlock_page(pages[i]); - put_page(pages[i]); - } + folio_unlock(folio); + folio_put(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); return -EAGAIN; @@ -1044,11 +992,10 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, } /* - * We should be called after prepare_pages() which should have locked + * We should be called after prepare_one_folio() which should have locked * all pages in the range. */ - for (i = 0; i < num_pages; i++) - WARN_ON(!PageLocked(pages[i])); + WARN_ON(!folio_test_locked(folio)); return ret; } @@ -1120,7 +1067,7 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode) btrfs_drew_write_unlock(&inode->root->snapshot_lock); } -int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count) +int btrfs_write_check(struct kiocb *iocb, size_t count) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); @@ -1175,20 +1122,17 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) loff_t pos; struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct page **pages = NULL; struct extent_changeset *data_reserved = NULL; u64 release_bytes = 0; u64 lockstart; u64 lockend; size_t num_written = 0; - int nrptrs; ssize_t ret; - bool only_release_metadata = false; - bool force_page_uptodate = false; loff_t old_isize = i_size_read(inode); unsigned int ilock_flags = 0; const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0); + bool only_release_metadata = false; if (nowait) ilock_flags |= BTRFS_ILOCK_TRY; @@ -1201,38 +1145,26 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) if (ret <= 0) goto out; - ret = btrfs_write_check(iocb, i, ret); + ret = btrfs_write_check(iocb, ret); if (ret < 0) goto out; pos = iocb->ki_pos; - nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE), - PAGE_SIZE / (sizeof(struct page *))); - nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); - nrptrs = max(nrptrs, 8); - pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL); - if (!pages) { - ret = -ENOMEM; - goto out; - } - while (iov_iter_count(i) > 0) { struct extent_state *cached_state = NULL; size_t offset = offset_in_page(pos); size_t sector_offset; - size_t write_bytes = min(iov_iter_count(i), - nrptrs * (size_t)PAGE_SIZE - - offset); - size_t num_pages; + size_t write_bytes = min(iov_iter_count(i), PAGE_SIZE - offset); size_t reserve_bytes; - size_t dirty_pages; size_t copied; size_t dirty_sectors; size_t num_sectors; + struct folio *folio = NULL; int extents_locked; + bool force_page_uptodate = false; /* - * Fault pages before locking them in prepare_pages + * Fault pages before locking them in prepare_one_folio() * to avoid recursive lock */ if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) { @@ -1271,8 +1203,6 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) only_release_metadata = true; } - num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE); - WARN_ON(num_pages > nrptrs); reserve_bytes = round_up(write_bytes + sector_offset, fs_info->sectorsize); WARN_ON(reserve_bytes == 0); @@ -1300,23 +1230,17 @@ again: break; } - /* - * This is going to setup the pages array with the number of - * pages we want, so we don't really need to worry about the - * contents of pages from loop to loop - */ - ret = prepare_pages(inode, pages, num_pages, - pos, write_bytes, force_page_uptodate, false); + ret = prepare_one_folio(inode, &folio, pos, write_bytes, + force_page_uptodate, false); if (ret) { btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); break; } - extents_locked = lock_and_cleanup_extent_if_need( - BTRFS_I(inode), pages, - num_pages, pos, write_bytes, &lockstart, - &lockend, nowait, &cached_state); + extents_locked = lock_and_cleanup_extent_if_need(BTRFS_I(inode), + folio, pos, write_bytes, &lockstart, + &lockend, nowait, &cached_state); if (extents_locked < 0) { if (!nowait && extents_locked == -EAGAIN) goto again; @@ -1327,28 +1251,18 @@ again: break; } - copied = btrfs_copy_from_user(pos, write_bytes, pages, i); + copied = btrfs_copy_from_user(pos, write_bytes, folio, i); num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes); dirty_sectors = round_up(copied + sector_offset, fs_info->sectorsize); dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors); - /* - * if we have trouble faulting in the pages, fall - * back to one page at a time - */ - if (copied < write_bytes) - nrptrs = 1; - if (copied == 0) { force_page_uptodate = true; dirty_sectors = 0; - dirty_pages = 0; } else { force_page_uptodate = false; - dirty_pages = DIV_ROUND_UP(copied + offset, - PAGE_SIZE); } if (num_sectors > dirty_sectors) { @@ -1358,13 +1272,10 @@ again: btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes, true); } else { - u64 __pos; - - __pos = round_down(pos, - fs_info->sectorsize) + - (dirty_pages << PAGE_SHIFT); + u64 release_start = round_up(pos + copied, + fs_info->sectorsize); btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, __pos, + data_reserved, release_start, release_bytes, true); } } @@ -1372,15 +1283,14 @@ again: release_bytes = round_up(copied + sector_offset, fs_info->sectorsize); - ret = btrfs_dirty_pages(BTRFS_I(inode), pages, - dirty_pages, pos, copied, + ret = btrfs_dirty_folio(BTRFS_I(inode), folio, pos, copied, &cached_state, only_release_metadata); /* * If we have not locked the extent range, because the range's * start offset is >= i_size, we might still have a non-NULL * cached extent state, acquired while marking the extent range - * as delalloc through btrfs_dirty_pages(). Therefore free any + * as delalloc through btrfs_dirty_page(). Therefore free any * possible cached extent state to avoid a memory leak. */ if (extents_locked) @@ -1391,7 +1301,7 @@ again: btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); if (ret) { - btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); + btrfs_drop_folio(fs_info, folio, pos, copied); break; } @@ -1399,7 +1309,7 @@ again: if (only_release_metadata) btrfs_check_nocow_unlock(BTRFS_I(inode)); - btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); + btrfs_drop_folio(fs_info, folio, pos, copied); cond_resched(); @@ -1407,8 +1317,6 @@ again: num_written += copied; } - kfree(pages); - if (release_bytes) { if (only_release_metadata) { btrfs_check_nocow_unlock(BTRFS_I(inode)); @@ -1453,7 +1361,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (ret || encoded->len == 0) goto out; - ret = btrfs_write_check(iocb, from, encoded->len); + ret = btrfs_write_check(iocb, encoded->len); if (ret < 0) goto out; @@ -3785,6 +3693,7 @@ const struct file_operations btrfs_file_operations = { .compat_ioctl = btrfs_compat_ioctl, #endif .remap_file_range = btrfs_remap_file_range, + .uring_cmd = btrfs_uring_cmd, .fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC, }; diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h index 912254e653cf..de89e644be29 100644 --- a/fs/btrfs/file.h +++ b/fs/btrfs/file.h @@ -34,9 +34,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded); int btrfs_release_file(struct inode *inode, struct file *file); -int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, - size_t num_pages, loff_t pos, size_t write_bytes, - struct extent_state **cached, bool noreserve); +int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos, + size_t write_bytes, struct extent_state **cached, bool noreserve); int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end); int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, size_t *write_bytes, bool nowait); @@ -44,7 +43,7 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode); bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state, u64 *delalloc_start_ret, u64 *delalloc_end_ret); -int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count); +int btrfs_write_check(struct kiocb *iocb, size_t count); ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i); #endif diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f4bcb2530660..cfa52ef40b06 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -11,6 +11,7 @@ #include <linux/ratelimit.h> #include <linux/error-injection.h> #include <linux/sched/mm.h> +#include <linux/string_choices.h> #include "ctree.h" #include "fs.h" #include "messages.h" @@ -1387,6 +1388,7 @@ static int __btrfs_write_out_cache(struct inode *inode, int bitmaps = 0; int ret; int must_iput = 0; + int i_size; if (!i_size_read(inode)) return -EIO; @@ -1457,11 +1459,16 @@ static int __btrfs_write_out_cache(struct inode *inode, io_ctl_zero_remaining_pages(io_ctl); /* Everything is written out, now we dirty the pages in the file. */ - ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, - io_ctl->num_pages, 0, i_size_read(inode), - &cached_state, false); - if (ret) - goto out_nospc; + i_size = i_size_read(inode); + for (int i = 0; i < round_up(i_size, PAGE_SIZE) / PAGE_SIZE; i++) { + u64 dirty_start = i * PAGE_SIZE; + u64 dirty_len = min_t(u64, dirty_start + PAGE_SIZE, i_size) - dirty_start; + + ret = btrfs_dirty_folio(BTRFS_I(inode), page_folio(io_ctl->pages[i]), + dirty_start, dirty_len, &cached_state, false); + if (ret < 0) + goto out_nospc; + } if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem); @@ -2936,12 +2943,11 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group, if (info->bytes >= bytes && !block_group->ro) count++; btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s", - info->offset, info->bytes, - (info->bitmap) ? "yes" : "no"); + info->offset, info->bytes, str_yes_no(info->bitmap)); } spin_unlock(&ctl->tree_lock); btrfs_info(fs_info, "block group has cluster?: %s", - list_empty(&block_group->cluster_list) ? "no" : "yes"); + str_no_yes(list_empty(&block_group->cluster_list))); btrfs_info(fs_info, "%d free space entries at or bigger than %llu bytes", count, bytes); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 79f64e383edd..79a1a3d6f04d 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -263,10 +263,10 @@ enum { BTRFS_FEATURE_INCOMPAT_ZONED | \ BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA) -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL /* * Features under developmen like Extent tree v2 support is enabled - * only under CONFIG_BTRFS_DEBUG. + * only under CONFIG_BTRFS_EXPERIMENTAL */ #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \ @@ -317,6 +317,8 @@ struct btrfs_dev_replace { struct percpu_counter bio_counter; wait_queue_head_t replace_wait; + + struct task_struct *replace_task; }; /* @@ -633,9 +635,10 @@ struct btrfs_fs_info { s32 delalloc_batch; struct percpu_counter evictable_extent_maps; - spinlock_t extent_map_shrinker_lock; - u64 extent_map_shrinker_last_root; - u64 extent_map_shrinker_last_ino; + u64 em_shrinker_last_root; + u64 em_shrinker_last_ino; + atomic64_t em_shrinker_nr_to_scan; + struct work_struct em_shrinker_work; /* Protected by 'trans_lock'. */ struct list_head dirty_cowonly_roots; @@ -876,12 +879,9 @@ struct btrfs_fs_info { #endif }; -#define page_to_inode(_page) (BTRFS_I(_Generic((_page), \ - struct page *: (_page))->mapping->host)) #define folio_to_inode(_folio) (BTRFS_I(_Generic((_folio), \ struct folio *: (_folio))->mapping->host)) -#define page_to_fs_info(_page) (page_to_inode(_page)->root->fs_info) #define folio_to_fs_info(_folio) (folio_to_inode(_folio)->root->fs_info) #define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e41bc7842577..03fe0de2cd0d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -421,7 +421,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, index++; continue; } - folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0); + folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); index++; if (IS_ERR(folio)) continue; @@ -556,8 +556,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, } else { struct folio *folio; - folio = __filemap_get_folio(inode->vfs_inode.i_mapping, - 0, 0, 0); + folio = filemap_get_folio(inode->vfs_inode.i_mapping, 0); ASSERT(!IS_ERR(folio)); btrfs_set_file_extent_compression(leaf, ei, 0); kaddr = kmap_local_folio(folio, 0); @@ -646,7 +645,7 @@ static bool can_cow_file_range_inline(struct btrfs_inode *inode, * If being used directly, you must have already checked we're allowed to cow * the range by getting true from can_cow_file_range_inline(). */ -static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 offset, +static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 size, size_t compressed_size, int compress_type, struct folio *compressed_folio, @@ -736,7 +735,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, return 1; lock_extent(&inode->io_tree, offset, end, &cached); - ret = __cow_file_range_inline(inode, offset, size, compressed_size, + ret = __cow_file_range_inline(inode, size, compressed_size, compress_type, compressed_folio, update_i_size); if (ret > 0) { @@ -832,32 +831,16 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, return 0; } /* - * Special check for subpage. + * Only enable sector perfect compression for experimental builds. * - * We lock the full page then run each delalloc range in the page, thus - * for the following case, we will hit some subpage specific corner case: + * This is a big feature change for subpage cases, and can hit + * different corner cases, so only limit this feature for + * experimental build for now. * - * 0 32K 64K - * | |///////| |///////| - * \- A \- B - * - * In above case, both range A and range B will try to unlock the full - * page [0, 64K), causing the one finished later will have page - * unlocked already, triggering various page lock requirement BUG_ON()s. - * - * So here we add an artificial limit that subpage compression can only - * if the range is fully page aligned. - * - * In theory we only need to ensure the first page is fully covered, but - * the tailing partial page will be locked until the full compression - * finishes, delaying the write of other range. - * - * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range - * first to prevent any submitted async extent to unlock the full page. - * By this, we can ensure for subpage case that only the last async_cow - * will unlock the full page. + * ETA for moving this out of experimental builds is 6.15. */ - if (fs_info->sectorsize < PAGE_SIZE) { + if (fs_info->sectorsize < PAGE_SIZE && + !IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) { if (!PAGE_ALIGNED(start) || !PAGE_ALIGNED(end + 1)) return 0; @@ -896,13 +879,14 @@ static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 e for (unsigned long index = start >> PAGE_SHIFT; index <= end_index; index++) { - folio = __filemap_get_folio(inode->i_mapping, index, 0, 0); + folio = filemap_get_folio(inode->i_mapping, index); if (IS_ERR(folio)) { if (!ret) ret = PTR_ERR(folio); continue; } - folio_clear_dirty_for_io(folio); + btrfs_folio_clamp_clear_dirty(inode_to_fs_info(inode), folio, start, + end + 1 - start); folio_put(folio); } return ret; @@ -1001,17 +985,6 @@ again: (start > 0 || end + 1 < inode->disk_i_size)) goto cleanup_and_bail_uncompressed; - /* - * For subpage case, we require full page alignment for the sector - * aligned range. - * Thus we must also check against @actual_end, not just @end. - */ - if (blocksize < PAGE_SIZE) { - if (!PAGE_ALIGNED(start) || - !PAGE_ALIGNED(round_up(actual_end, blocksize))) - goto cleanup_and_bail_uncompressed; - } - total_compressed = min_t(unsigned long, total_compressed, BTRFS_MAX_UNCOMPRESSED); total_in = 0; @@ -1359,7 +1332,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, u64 alloc_hint = 0; u64 orig_start = start; u64 num_bytes; - unsigned long ram_size; u64 cur_alloc_size = 0; u64 min_alloc_size; u64 blocksize = fs_info->sectorsize; @@ -1367,7 +1339,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, struct extent_map *em; unsigned clear_bits; unsigned long page_ops; - bool extent_reserved = false; int ret = 0; if (btrfs_is_free_space_inode(inode)) { @@ -1421,8 +1392,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, struct btrfs_ordered_extent *ordered; struct btrfs_file_extent file_extent; - cur_alloc_size = num_bytes; - ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, + ret = btrfs_reserve_extent(root, num_bytes, num_bytes, min_alloc_size, 0, alloc_hint, &ins, 1, 1); if (ret == -EAGAIN) { @@ -1453,9 +1423,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret < 0) goto out_unlock; cur_alloc_size = ins.offset; - extent_reserved = true; - ram_size = ins.offset; file_extent.disk_bytenr = ins.objectid; file_extent.disk_num_bytes = ins.offset; file_extent.num_bytes = ins.offset; @@ -1463,14 +1431,14 @@ static noinline int cow_file_range(struct btrfs_inode *inode, file_extent.offset = 0; file_extent.compression = BTRFS_COMPRESS_NONE; - lock_extent(&inode->io_tree, start, start + ram_size - 1, + lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, &cached); em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_REGULAR); if (IS_ERR(em)) { unlock_extent(&inode->io_tree, start, - start + ram_size - 1, &cached); + start + cur_alloc_size - 1, &cached); ret = PTR_ERR(em); goto out_reserve; } @@ -1480,7 +1448,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, 1 << BTRFS_ORDERED_REGULAR); if (IS_ERR(ordered)) { unlock_extent(&inode->io_tree, start, - start + ram_size - 1, &cached); + start + cur_alloc_size - 1, &cached); ret = PTR_ERR(ordered); goto out_drop_extent_cache; } @@ -1501,7 +1469,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, */ if (ret) btrfs_drop_extent_map_range(inode, start, - start + ram_size - 1, + start + cur_alloc_size - 1, false); } btrfs_put_ordered_extent(ordered); @@ -1519,7 +1487,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, page_ops = (keep_locked ? 0 : PAGE_UNLOCK); page_ops |= PAGE_SET_ORDERED; - extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, + extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, locked_folio, &cached, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); @@ -1529,7 +1497,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; - extent_reserved = false; + cur_alloc_size = 0; /* * btrfs_reloc_clone_csums() error, since start is increased @@ -1545,7 +1513,7 @@ done: return ret; out_drop_extent_cache: - btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false); + btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false); out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); @@ -1599,13 +1567,12 @@ out_unlock: * to decrement again the data space_info's bytes_may_use counter, * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. */ - if (extent_reserved) { + if (cur_alloc_size) { extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, locked_folio, &cached, clear_bits, page_ops); btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); - start += cur_alloc_size; } /* @@ -1614,11 +1581,13 @@ out_unlock: * space_info's bytes_may_use counter, reserved in * btrfs_check_data_free_space(). */ - if (start < end) { + if (start + cur_alloc_size < end) { clear_bits |= EXTENT_CLEAR_DATA_RESV; - extent_clear_unlock_delalloc(inode, start, end, locked_folio, + extent_clear_unlock_delalloc(inode, start + cur_alloc_size, + end, locked_folio, &cached, clear_bits, page_ops); - btrfs_qgroup_free_data(inode, NULL, start, end - start + 1, NULL); + btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size, + end - start - cur_alloc_size + 1, NULL); } return ret; } @@ -3094,34 +3063,6 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } - if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { - BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ - - btrfs_inode_safe_disk_i_size_write(inode, 0); - if (freespace_inode) - trans = btrfs_join_transaction_spacecache(root); - else - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - trans->block_rsv = &inode->block_rsv; - ret = btrfs_update_inode_fallback(trans, inode); - if (ret) /* -ENOMEM or corruption */ - btrfs_abort_transaction(trans, ret); - - ret = btrfs_insert_raid_extent(trans, ordered_extent); - if (ret) - btrfs_abort_transaction(trans, ret); - - goto out; - } - - clear_bits |= EXTENT_LOCKED; - lock_extent(io_tree, start, end, &cached_state); - if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); else @@ -3135,8 +3076,31 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) trans->block_rsv = &inode->block_rsv; ret = btrfs_insert_raid_extent(trans, ordered_extent); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { + /* Logic error */ + ASSERT(list_empty(&ordered_extent->list)); + if (!list_empty(&ordered_extent->list)) { + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } + + btrfs_inode_safe_disk_i_size_write(inode, 0); + ret = btrfs_update_inode_fallback(trans, inode); + if (ret) { + /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, ret); + } goto out; + } + + clear_bits |= EXTENT_LOCKED; + lock_extent(io_tree, start, end, &cached_state); if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compress_type = ordered_extent->compress_type; @@ -3791,14 +3755,45 @@ static int btrfs_init_file_extent_tree(struct btrfs_inode *inode) return 0; } +static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) +{ + struct btrfs_root *root = inode->root; + struct btrfs_inode *existing; + const u64 ino = btrfs_ino(inode); + int ret; + + if (inode_unhashed(&inode->vfs_inode)) + return 0; + + if (prealloc) { + ret = xa_reserve(&root->inodes, ino, GFP_NOFS); + if (ret) + return ret; + } + + existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC); + + if (xa_is_err(existing)) { + ret = xa_err(existing); + ASSERT(ret != -EINVAL); + ASSERT(ret != -ENOMEM); + return ret; + } else if (existing) { + WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); + } + + return 0; +} + /* - * read an inode from the btree into the in-memory inode + * Read a locked inode from the btree into the in-memory inode and add it to + * its root list/tree. + * + * On failure clean up the inode. */ -static int btrfs_read_locked_inode(struct inode *inode, - struct btrfs_path *in_path) +static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_path *path = in_path; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -3812,25 +3807,25 @@ static int btrfs_read_locked_inode(struct inode *inode, ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); if (ret) - return ret; + goto out; ret = btrfs_fill_inode(inode, &rdev); if (!ret) filled = true; - if (!path) { - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - } + ASSERT(path); btrfs_get_inode_key(BTRFS_I(inode), &location); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { - if (path != in_path) - btrfs_free_path(path); - return ret; + /* + * ret > 0 can come from btrfs_search_slot called by + * btrfs_lookup_inode(), this means the inode was not found. + */ + if (ret > 0) + ret = -ENOENT; + goto out; } leaf = path->nodes[0]; @@ -3965,8 +3960,6 @@ cache_acl: btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret); } - if (path != in_path) - btrfs_free_path(path); if (!maybe_acls) cache_no_acl(inode); @@ -3993,7 +3986,15 @@ cache_acl: } btrfs_sync_inode_flags_to_i_flags(inode); + + ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); + if (ret) + goto out; + return 0; +out: + iget_failed(inode); + return ret; } /* @@ -5502,35 +5503,7 @@ out: return err; } -static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) -{ - struct btrfs_root *root = inode->root; - struct btrfs_inode *existing; - const u64 ino = btrfs_ino(inode); - int ret; - - if (inode_unhashed(&inode->vfs_inode)) - return 0; - if (prealloc) { - ret = xa_reserve(&root->inodes, ino, GFP_NOFS); - if (ret) - return ret; - } - - existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC); - - if (xa_is_err(existing)) { - ret = xa_err(existing); - ASSERT(ret != -EINVAL); - ASSERT(ret != -ENOMEM); - return ret; - } else if (existing) { - WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); - } - - return 0; -} static void btrfs_del_inode_from_root(struct btrfs_inode *inode) { @@ -5592,10 +5565,8 @@ static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) } /* - * Get an inode object given its inode number and corresponding root. - * Path can be preallocated to prevent recursing back to iget through - * allocator. NULL is also valid but may require an additional allocation - * later. + * Get an inode object given its inode number and corresponding root. Path is + * preallocated to prevent recursing back to iget through allocator. */ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, struct btrfs_path *path) @@ -5611,30 +5582,40 @@ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, return inode; ret = btrfs_read_locked_inode(inode, path); - /* - * ret > 0 can come from btrfs_search_slot called by - * btrfs_read_locked_inode(), this means the inode item was not found. - */ - if (ret > 0) - ret = -ENOENT; - if (ret < 0) - goto error; - - ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); - if (ret < 0) - goto error; + if (ret) + return ERR_PTR(ret); unlock_new_inode(inode); - return inode; -error: - iget_failed(inode); - return ERR_PTR(ret); } +/* + * Get an inode object given its inode number and corresponding root. + */ struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) { - return btrfs_iget_path(ino, root, NULL); + struct inode *inode; + struct btrfs_path *path; + int ret; + + inode = btrfs_iget_locked(ino, root); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (!(inode->i_state & I_NEW)) + return inode; + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + + ret = btrfs_read_locked_inode(inode, path); + btrfs_free_path(path); + if (ret) + return ERR_PTR(ret); + + unlock_new_inode(inode); + return inode; } static struct inode *new_simple_dir(struct inode *dir, @@ -6023,7 +6004,7 @@ again: * offset. This means that new entries created during readdir * are *guaranteed* to be seen in the future by that readdir. * This has broken buggy programs which operate on names as - * they're returned by readdir. Until we re-use freed offsets + * they're returned by readdir. Until we reuse freed offsets * we have this hack to stop new entries from being returned * under the assumption that they'll never reach this huge * offset. @@ -6765,8 +6746,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, return ret; } -static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, - struct folio *folio) +static int read_inline_extent(struct btrfs_path *path, struct folio *folio) { struct btrfs_file_extent_item *fi; void *kaddr; @@ -6964,7 +6944,7 @@ next: ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE); ASSERT(em->len == fs_info->sectorsize); - ret = read_inline_extent(inode, path, folio); + ret = read_inline_extent(path, folio); if (ret < 0) goto out; goto insert; @@ -8972,28 +8952,6 @@ out_inode: return finish_open_simple(file, ret); } -void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; - struct folio *folio; - u32 len; - - ASSERT(end + 1 - start <= U32_MAX); - len = end + 1 - start; - while (index <= end_index) { - folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0); - ASSERT(!IS_ERR(folio)); /* folios should be in the extent_io_tree */ - - /* This is for data, which doesn't yet support larger folio. */ - ASSERT(folio_order(folio) == 0); - btrfs_folio_set_writeback(fs_info, folio, start, len); - folio_put(folio); - index++; - } -} - int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type) { @@ -9038,12 +8996,16 @@ static ssize_t btrfs_encoded_read_inline( unsigned long ptr; void *tmp; ssize_t ret; + const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } + + path->nowait = nowait; + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), extent_start, 0); if (ret) { @@ -9107,6 +9069,7 @@ out: struct btrfs_encoded_read_private { wait_queue_head_t wait; + void *uring_ctx; atomic_t pending; blk_status_t status; }; @@ -9126,26 +9089,40 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) */ WRITE_ONCE(priv->status, bbio->bio.bi_status); } - if (!atomic_dec_return(&priv->pending)) - wake_up(&priv->wait); + if (atomic_dec_return(&priv->pending) == 0) { + int err = blk_status_to_errno(READ_ONCE(priv->status)); + + if (priv->uring_ctx) { + btrfs_uring_read_extent_endio(priv->uring_ctx, err); + kfree(priv); + } else { + wake_up(&priv->wait); + } + } bio_put(&bbio->bio); } int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, u64 disk_bytenr, - u64 disk_io_size, struct page **pages) + u64 disk_bytenr, u64 disk_io_size, + struct page **pages, void *uring_ctx) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_encoded_read_private priv = { - .pending = ATOMIC_INIT(1), - }; + struct btrfs_encoded_read_private *priv; unsigned long i = 0; struct btrfs_bio *bbio; + int ret; + + priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS); + if (!priv) + return -ENOMEM; - init_waitqueue_head(&priv.wait); + init_waitqueue_head(&priv->wait); + atomic_set(&priv->pending, 1); + priv->status = 0; + priv->uring_ctx = uring_ctx; bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, - btrfs_encoded_read_endio, &priv); + btrfs_encoded_read_endio, priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->inode = inode; @@ -9153,11 +9130,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { - atomic_inc(&priv.pending); + atomic_inc(&priv->pending); btrfs_submit_bbio(bbio, 0); bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, - btrfs_encoded_read_endio, &priv); + btrfs_encoded_read_endio, priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->inode = inode; continue; @@ -9168,22 +9145,33 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, disk_io_size -= bytes; } while (disk_io_size); - atomic_inc(&priv.pending); + atomic_inc(&priv->pending); btrfs_submit_bbio(bbio, 0); - if (atomic_dec_return(&priv.pending)) - io_wait_event(priv.wait, !atomic_read(&priv.pending)); - /* See btrfs_encoded_read_endio() for ordering. */ - return blk_status_to_errno(READ_ONCE(priv.status)); + if (uring_ctx) { + if (atomic_dec_return(&priv->pending) == 0) { + ret = blk_status_to_errno(READ_ONCE(priv->status)); + btrfs_uring_read_extent_endio(uring_ctx, ret); + kfree(priv); + return ret; + } + + return -EIOCBQUEUED; + } else { + if (atomic_dec_return(&priv->pending) != 0) + io_wait_event(priv->wait, !atomic_read(&priv->pending)); + /* See btrfs_encoded_read_endio() for ordering. */ + ret = blk_status_to_errno(READ_ONCE(priv->status)); + kfree(priv); + return ret; + } } -static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, - struct iov_iter *iter, - u64 start, u64 lockend, - struct extent_state **cached_state, - u64 disk_bytenr, u64 disk_io_size, - size_t count, bool compressed, - bool *unlocked) +ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, + u64 start, u64 lockend, + struct extent_state **cached_state, + u64 disk_bytenr, u64 disk_io_size, + size_t count, bool compressed, bool *unlocked) { struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); struct extent_io_tree *io_tree = &inode->io_tree; @@ -9203,8 +9191,8 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, goto out; } - ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, - disk_io_size, pages); + ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, + disk_io_size, pages, NULL); if (ret) goto out; @@ -9244,21 +9232,26 @@ out: } ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, - struct btrfs_ioctl_encoded_io_args *encoded) + struct btrfs_ioctl_encoded_io_args *encoded, + struct extent_state **cached_state, + u64 *disk_bytenr, u64 *disk_io_size) { struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *io_tree = &inode->io_tree; ssize_t ret; size_t count = iov_iter_count(iter); - u64 start, lockend, disk_bytenr, disk_io_size; - struct extent_state *cached_state = NULL; + u64 start, lockend; struct extent_map *em; + const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); bool unlocked = false; file_accessed(iocb->ki_filp); - btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); + ret = btrfs_inode_lock(inode, + BTRFS_ILOCK_SHARED | (nowait ? BTRFS_ILOCK_TRY : 0)); + if (ret) + return ret; if (iocb->ki_pos >= inode->vfs_inode.i_size) { btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); @@ -9271,21 +9264,46 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, */ lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; - for (;;) { + if (nowait) { struct btrfs_ordered_extent *ordered; - ret = btrfs_wait_ordered_range(inode, start, - lockend - start + 1); - if (ret) + if (filemap_range_needs_writeback(inode->vfs_inode.i_mapping, + start, lockend)) { + ret = -EAGAIN; + goto out_unlock_inode; + } + + if (!try_lock_extent(io_tree, start, lockend, cached_state)) { + ret = -EAGAIN; goto out_unlock_inode; - lock_extent(io_tree, start, lockend, &cached_state); + } + ordered = btrfs_lookup_ordered_range(inode, start, lockend - start + 1); - if (!ordered) - break; - btrfs_put_ordered_extent(ordered); - unlock_extent(io_tree, start, lockend, &cached_state); - cond_resched(); + if (ordered) { + btrfs_put_ordered_extent(ordered); + unlock_extent(io_tree, start, lockend, cached_state); + ret = -EAGAIN; + goto out_unlock_inode; + } + } else { + for (;;) { + struct btrfs_ordered_extent *ordered; + + ret = btrfs_wait_ordered_range(inode, start, + lockend - start + 1); + if (ret) + goto out_unlock_inode; + + lock_extent(io_tree, start, lockend, cached_state); + ordered = btrfs_lookup_ordered_range(inode, start, + lockend - start + 1); + if (!ordered) + break; + btrfs_put_ordered_extent(ordered); + unlock_extent(io_tree, start, lockend, cached_state); + cond_resched(); + } } em = btrfs_get_extent(inode, NULL, start, lockend - start + 1); @@ -9304,9 +9322,9 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, free_extent_map(em); em = NULL; ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, - &cached_state, extent_start, + cached_state, extent_start, count, encoded, &unlocked); - goto out; + goto out_unlock_extent; } /* @@ -9317,12 +9335,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, inode->vfs_inode.i_size) - iocb->ki_pos; if (em->disk_bytenr == EXTENT_MAP_HOLE || (em->flags & EXTENT_FLAG_PREALLOC)) { - disk_bytenr = EXTENT_MAP_HOLE; + *disk_bytenr = EXTENT_MAP_HOLE; count = min_t(u64, count, encoded->len); encoded->len = count; encoded->unencoded_len = count; } else if (extent_map_is_compressed(em)) { - disk_bytenr = em->disk_bytenr; + *disk_bytenr = em->disk_bytenr; /* * Bail if the buffer isn't large enough to return the whole * compressed extent. @@ -9331,7 +9349,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, ret = -ENOBUFS; goto out_em; } - disk_io_size = em->disk_num_bytes; + *disk_io_size = em->disk_num_bytes; count = em->disk_num_bytes; encoded->unencoded_len = em->ram_bytes; encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset); @@ -9341,47 +9359,42 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, goto out_em; encoded->compression = ret; } else { - disk_bytenr = extent_map_block_start(em) + (start - em->start); + *disk_bytenr = extent_map_block_start(em) + (start - em->start); if (encoded->len > count) encoded->len = count; /* * Don't read beyond what we locked. This also limits the page * allocations that we'll do. */ - disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; - count = start + disk_io_size - iocb->ki_pos; + *disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; + count = start + *disk_io_size - iocb->ki_pos; encoded->len = count; encoded->unencoded_len = count; - disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize); + *disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize); } free_extent_map(em); em = NULL; - if (disk_bytenr == EXTENT_MAP_HOLE) { - unlock_extent(io_tree, start, lockend, &cached_state); + if (*disk_bytenr == EXTENT_MAP_HOLE) { + unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); unlocked = true; ret = iov_iter_zero(count, iter); if (ret != count) ret = -EFAULT; } else { - ret = btrfs_encoded_read_regular(iocb, iter, start, lockend, - &cached_state, disk_bytenr, - disk_io_size, count, - encoded->compression, - &unlocked); + ret = -EIOCBQUEUED; + goto out_unlock_extent; } -out: - if (ret >= 0) - iocb->ki_pos += encoded->len; out_em: free_extent_map(em); out_unlock_extent: - if (!unlocked) - unlock_extent(io_tree, start, lockend, &cached_state); + /* Leave inode and extent locked if we need to do a read. */ + if (!unlocked && ret != -EIOCBQUEUED) + unlock_extent(io_tree, start, lockend, cached_state); out_unlock_inode: - if (!unlocked) + if (!unlocked && ret != -EIOCBQUEUED) btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return ret; } @@ -9492,7 +9505,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, */ disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); - folios = kvcalloc(nr_folios, sizeof(struct page *), GFP_KERNEL_ACCOUNT); + folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT); if (!folios) return -ENOMEM; for (i = 0; i < nr_folios; i++) { @@ -9556,7 +9569,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (encoded->unencoded_len == encoded->len && encoded->unencoded_offset == 0 && can_cow_file_range_inline(inode, start, encoded->len, orig_count)) { - ret = __cow_file_range_inline(inode, start, encoded->len, + ret = __cow_file_range_inline(inode, encoded->len, orig_count, compression, folios[0], true); if (ret <= 0) { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index adb591b1d071..c9302d193187 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -29,6 +29,7 @@ #include <linux/fileattr.h> #include <linux/fsverity.h> #include <linux/sched/xacct.h> +#include <linux/io_uring/cmd.h> #include "ctree.h" #include "disk-io.h" #include "export.h" @@ -1048,7 +1049,6 @@ static noinline int btrfs_mksnapshot(const struct path *parent, struct btrfs_qgroup_inherit *inherit) { int ret; - bool snapshot_force_cow = false; /* * Force new buffered writes to reserve space even when NOCOW is @@ -1067,15 +1067,13 @@ static noinline int btrfs_mksnapshot(const struct path *parent, * creation. */ atomic_inc(&root->snapshot_force_cow); - snapshot_force_cow = true; btrfs_wait_ordered_extents(root, U64_MAX, NULL); ret = btrfs_mksubvol(parent, idmap, name, namelen, root, readonly, inherit); + atomic_dec(&root->snapshot_force_cow); out: - if (snapshot_force_cow) - atomic_dec(&root->snapshot_force_cow); btrfs_drew_read_unlock(&root->snapshot_lock); return ret; } @@ -4057,8 +4055,7 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info, return 0; } -static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info, - void __user *arg) +static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4513,12 +4510,17 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); size_t copy_end; + struct btrfs_inode *inode = BTRFS_I(file_inode(file)); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; loff_t pos; struct kiocb kiocb; ssize_t ret; + u64 disk_bytenr, disk_io_size; + struct extent_state *cached_state = NULL; if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; @@ -4571,7 +4573,32 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos; - ret = btrfs_encoded_read(&kiocb, &iter, &args); + ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state, + &disk_bytenr, &disk_io_size); + + if (ret == -EIOCBQUEUED) { + bool unlocked = false; + u64 start, lockend, count; + + start = ALIGN_DOWN(kiocb.ki_pos, fs_info->sectorsize); + lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; + + if (args.compression) + count = disk_io_size; + else + count = args.len; + + ret = btrfs_encoded_read_regular(&kiocb, &iter, start, lockend, + &cached_state, disk_bytenr, + disk_io_size, count, + args.compression, &unlocked); + + if (!unlocked) { + unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + } + } + if (ret >= 0) { fsnotify_access(file); if (copy_to_user(argp + copy_end, @@ -4689,6 +4716,439 @@ out_acct: return ret; } +/* + * Context that's attached to an encoded read io_uring command, in cmd->pdu. It + * contains the fields in btrfs_uring_read_extent that are necessary to finish + * off and cleanup the I/O in btrfs_uring_read_finished. + */ +struct btrfs_uring_priv { + struct io_uring_cmd *cmd; + struct page **pages; + unsigned long nr_pages; + struct kiocb iocb; + struct iovec *iov; + struct iov_iter iter; + struct extent_state *cached_state; + u64 count; + u64 start; + u64 lockend; + int err; + bool compressed; +}; + +struct io_btrfs_cmd { + struct btrfs_uring_priv *priv; +}; + +static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); + struct btrfs_uring_priv *priv = bc->priv; + struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp)); + struct extent_io_tree *io_tree = &inode->io_tree; + unsigned long index; + u64 cur; + size_t page_offset; + ssize_t ret; + + if (priv->err) { + ret = priv->err; + goto out; + } + + if (priv->compressed) { + index = 0; + page_offset = 0; + } else { + index = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT; + page_offset = offset_in_page(priv->iocb.ki_pos - priv->start); + } + cur = 0; + while (cur < priv->count) { + size_t bytes = min_t(size_t, priv->count - cur, PAGE_SIZE - page_offset); + + if (copy_page_to_iter(priv->pages[index], page_offset, bytes, + &priv->iter) != bytes) { + ret = -EFAULT; + goto out; + } + + index++; + cur += bytes; + page_offset = 0; + } + ret = priv->count; + +out: + unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + + io_uring_cmd_done(cmd, ret, 0, issue_flags); + add_rchar(current, ret); + + for (index = 0; index < priv->nr_pages; index++) + __free_page(priv->pages[index]); + + kfree(priv->pages); + kfree(priv->iov); + kfree(priv); +} + +void btrfs_uring_read_extent_endio(void *ctx, int err) +{ + struct btrfs_uring_priv *priv = ctx; + struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(priv->cmd, struct io_btrfs_cmd); + + priv->err = err; + bc->priv = priv; + + io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished); +} + +static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter, + u64 start, u64 lockend, + struct extent_state *cached_state, + u64 disk_bytenr, u64 disk_io_size, + size_t count, bool compressed, + struct iovec *iov, struct io_uring_cmd *cmd) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct extent_io_tree *io_tree = &inode->io_tree; + struct page **pages; + struct btrfs_uring_priv *priv = NULL; + unsigned long nr_pages; + int ret; + + nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE); + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) + return -ENOMEM; + ret = btrfs_alloc_page_array(nr_pages, pages, 0); + if (ret) { + ret = -ENOMEM; + goto out_fail; + } + + priv = kmalloc(sizeof(*priv), GFP_NOFS); + if (!priv) { + ret = -ENOMEM; + goto out_fail; + } + + priv->iocb = *iocb; + priv->iov = iov; + priv->iter = *iter; + priv->count = count; + priv->cmd = cmd; + priv->cached_state = cached_state; + priv->compressed = compressed; + priv->nr_pages = nr_pages; + priv->pages = pages; + priv->start = start; + priv->lockend = lockend; + priv->err = 0; + + ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, + disk_io_size, pages, priv); + if (ret && ret != -EIOCBQUEUED) + goto out_fail; + + /* + * If we return -EIOCBQUEUED, we're deferring the cleanup to + * btrfs_uring_read_finished(), which will handle unlocking the extent + * and inode and freeing the allocations. + */ + + return -EIOCBQUEUED; + +out_fail: + unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + kfree(priv); + return ret; +} + +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); + size_t copy_end; + struct btrfs_ioctl_encoded_io_args args = { 0 }; + int ret; + u64 disk_bytenr, disk_io_size; + struct file *file; + struct btrfs_inode *inode; + struct btrfs_fs_info *fs_info; + struct extent_io_tree *io_tree; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + loff_t pos; + struct kiocb kiocb; + struct extent_state *cached_state = NULL; + u64 start, lockend; + void __user *sqe_addr; + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_acct; + } + file = cmd->file; + inode = BTRFS_I(file->f_inode); + fs_info = inode->root->fs_info; + io_tree = &inode->io_tree; + sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); + + if (issue_flags & IO_URING_F_COMPAT) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; + + copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags); + if (copy_from_user(&args32, sqe_addr, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + args.iov = compat_ptr(args32.iov); + args.iovcnt = args32.iovcnt; + args.offset = args32.offset; + args.flags = args32.flags; +#else + return -ENOTTY; +#endif + } else { + copy_end = copy_end_kernel; + if (copy_from_user(&args, sqe_addr, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + } + + if (args.flags != 0) + return -EINVAL; + + ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), + &iov, &iter); + if (ret < 0) + goto out_acct; + + if (iov_iter_count(&iter) == 0) { + ret = 0; + goto out_free; + } + + pos = args.offset; + ret = rw_verify_area(READ, file, &pos, args.len); + if (ret < 0) + goto out_free; + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = pos; + + if (issue_flags & IO_URING_F_NONBLOCK) + kiocb.ki_flags |= IOCB_NOWAIT; + + start = ALIGN_DOWN(pos, fs_info->sectorsize); + lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; + + ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state, + &disk_bytenr, &disk_io_size); + if (ret < 0 && ret != -EIOCBQUEUED) + goto out_free; + + file_accessed(file); + + if (copy_to_user(sqe_addr + copy_end, (const char *)&args + copy_end_kernel, + sizeof(args) - copy_end_kernel)) { + if (ret == -EIOCBQUEUED) { + unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + } + ret = -EFAULT; + goto out_free; + } + + if (ret == -EIOCBQUEUED) { + u64 count; + + /* + * If we've optimized things by storing the iovecs on the stack, + * undo this. + */ + if (!iov) { + iov = kmalloc(sizeof(struct iovec) * args.iovcnt, GFP_NOFS); + if (!iov) { + unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + ret = -ENOMEM; + goto out_acct; + } + + memcpy(iov, iovstack, sizeof(struct iovec) * args.iovcnt); + } + + count = min_t(u64, iov_iter_count(&iter), disk_io_size); + + /* Match ioctl by not returning past EOF if uncompressed. */ + if (!args.compression) + count = min_t(u64, count, args.len); + + ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend, + cached_state, disk_bytenr, + disk_io_size, count, + args.compression, iov, cmd); + + goto out_acct; + } + +out_free: + kfree(iov); + +out_acct: + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + + return ret; +} + +int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + switch (cmd->cmd_op) { + case BTRFS_IOC_ENCODED_READ: +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + case BTRFS_IOC_ENCODED_READ_32: +#endif + return btrfs_uring_encoded_read(cmd, issue_flags); + } + + return -EINVAL; +} + +static int btrfs_ioctl_subvol_sync(struct btrfs_fs_info *fs_info, void __user *argp) +{ + struct btrfs_root *root; + struct btrfs_ioctl_subvol_wait args = { 0 }; + signed long sched_ret; + int refs; + u64 root_flags; + bool wait_for_deletion = false; + bool found = false; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + switch (args.mode) { + case BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED: + /* + * Wait for the first one deleted that waits until all previous + * are cleaned. + */ + spin_lock(&fs_info->trans_lock); + if (!list_empty(&fs_info->dead_roots)) { + root = list_last_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + args.subvolid = btrfs_root_id(root); + found = true; + } + spin_unlock(&fs_info->trans_lock); + if (!found) + return -ENOENT; + + fallthrough; + case BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE: + if ((0 < args.subvolid && args.subvolid < BTRFS_FIRST_FREE_OBJECTID) || + BTRFS_LAST_FREE_OBJECTID < args.subvolid) + return -EINVAL; + break; + case BTRFS_SUBVOL_SYNC_COUNT: + spin_lock(&fs_info->trans_lock); + args.count = list_count_nodes(&fs_info->dead_roots); + spin_unlock(&fs_info->trans_lock); + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + case BTRFS_SUBVOL_SYNC_PEEK_FIRST: + spin_lock(&fs_info->trans_lock); + /* Last in the list was deleted first. */ + if (!list_empty(&fs_info->dead_roots)) { + root = list_last_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + args.subvolid = btrfs_root_id(root); + } else { + args.subvolid = 0; + } + spin_unlock(&fs_info->trans_lock); + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + case BTRFS_SUBVOL_SYNC_PEEK_LAST: + spin_lock(&fs_info->trans_lock); + /* First in the list was deleted last. */ + if (!list_empty(&fs_info->dead_roots)) { + root = list_first_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + args.subvolid = btrfs_root_id(root); + } else { + args.subvolid = 0; + } + spin_unlock(&fs_info->trans_lock); + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + default: + return -EINVAL; + } + + /* 32bit limitation: fs_roots_radix key is not wide enough. */ + if (sizeof(unsigned long) != sizeof(u64) && args.subvolid > U32_MAX) + return -EOVERFLOW; + + while (1) { + /* Wait for the specific one. */ + if (down_read_interruptible(&fs_info->subvol_sem) == -EINTR) + return -EINTR; + refs = -1; + spin_lock(&fs_info->fs_roots_radix_lock); + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)args.subvolid); + if (root) { + spin_lock(&root->root_item_lock); + refs = btrfs_root_refs(&root->root_item); + root_flags = btrfs_root_flags(&root->root_item); + spin_unlock(&root->root_item_lock); + } + spin_unlock(&fs_info->fs_roots_radix_lock); + up_read(&fs_info->subvol_sem); + + /* Subvolume does not exist. */ + if (!root) + return -ENOENT; + + /* Subvolume not deleted at all. */ + if (refs > 0) + return -EEXIST; + /* We've waited and now the subvolume is gone. */ + if (wait_for_deletion && refs == -1) { + /* Return the one we waited for as the last one. */ + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + } + + /* Subvolume not found on the first try (deleted or never existed). */ + if (refs == -1) + return -ENOENT; + + wait_for_deletion = true; + ASSERT(root_flags & BTRFS_ROOT_SUBVOL_DEAD); + sched_ret = schedule_timeout_interruptible(HZ); + /* Early wake up or error. */ + if (sched_ret != 0) + return -EINTR; + } + + return 0; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -4811,7 +5271,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_QUOTA_RESCAN_STATUS: return btrfs_ioctl_quota_rescan_status(fs_info, argp); case BTRFS_IOC_QUOTA_RESCAN_WAIT: - return btrfs_ioctl_quota_rescan_wait(fs_info, argp); + return btrfs_ioctl_quota_rescan_wait(fs_info); case BTRFS_IOC_DEV_REPLACE: return btrfs_ioctl_dev_replace(fs_info, argp); case BTRFS_IOC_GET_SUPPORTED_FEATURES: @@ -4840,6 +5300,8 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_ENCODED_WRITE_32: return btrfs_ioctl_encoded_write(file, argp, true); #endif + case BTRFS_IOC_SUBVOL_SYNC_WAIT: + return btrfs_ioctl_subvol_sync(fs_info, argp); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 19cd26b0244a..2b760c8778f8 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -22,5 +22,7 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); int __pure btrfs_is_empty_uuid(const u8 *uuid); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); +int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); +void btrfs_uring_read_extent_endio(void *ctx, int err); #endif diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 6a0b7abb5bd9..9a7a7b723305 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -162,21 +162,6 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb) } /* - * Try-lock for write. - * - * Return 1 if the rwlock has been taken, 0 otherwise - */ -int btrfs_try_tree_write_lock(struct extent_buffer *eb) -{ - if (down_write_trylock(&eb->lock)) { - btrfs_set_eb_lock_owner(eb, current->pid); - trace_btrfs_try_tree_write_lock(eb); - return 1; - } - return 0; -} - -/* * Release read lock. */ void btrfs_tree_read_unlock(struct extent_buffer *eb) diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 3c15c75e0582..46c8be2afab1 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -180,7 +180,6 @@ static inline void btrfs_tree_read_lock(struct extent_buffer *eb) void btrfs_tree_read_unlock(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); -int btrfs_try_tree_write_lock(struct extent_buffer *eb); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 72856f6775f7..a45bc11f8665 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -80,7 +80,7 @@ void lzo_free_workspace(struct list_head *ws) kfree(workspace); } -struct list_head *lzo_alloc_workspace(unsigned int level) +struct list_head *lzo_alloc_workspace(void) { struct workspace *workspace; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index a0e8deca87a7..a6f92836c9b1 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -226,8 +226,7 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, return qgroup; } -static void __del_qgroup_rb(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup *qgroup) +static void __del_qgroup_rb(struct btrfs_qgroup *qgroup) { struct btrfs_qgroup_list *list; @@ -258,7 +257,7 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) return -ENOENT; rb_erase(&qgroup->node, &fs_info->qgroup_tree); - __del_qgroup_rb(fs_info, qgroup); + __del_qgroup_rb(qgroup); return 0; } @@ -469,7 +468,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) /* * If a qgroup exists for a subvolume ID, it is possible * that subvolume has been deleted, in which case - * re-using that ID would lead to incorrect accounting. + * reusing that ID would lead to incorrect accounting. * * Ensure that we skip any such subvol ids. * @@ -643,7 +642,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) while ((n = rb_first(&fs_info->qgroup_tree))) { qgroup = rb_entry(n, struct btrfs_qgroup, node); rb_erase(n, &fs_info->qgroup_tree); - __del_qgroup_rb(fs_info, qgroup); + __del_qgroup_rb(qgroup); btrfs_sysfs_del_one_qgroup(fs_info, qgroup); kfree(qgroup); } @@ -2001,27 +2000,27 @@ out: * Return <0 for insertion failure, caller can free @record safely. */ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record) + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record, + u64 bytenr) { struct btrfs_qgroup_extent_record *existing, *ret; - const unsigned long index = (record->bytenr >> fs_info->sectorsize_bits); + const unsigned long index = (bytenr >> fs_info->sectorsize_bits); if (!btrfs_qgroup_full_accounting(fs_info)) return 1; #if BITS_PER_LONG == 32 - if (record->bytenr >= MAX_LFS_FILESIZE) { + if (bytenr >= MAX_LFS_FILESIZE) { btrfs_err_rl(fs_info, "qgroup record for extent at %llu is beyond 32bit page cache and xarray index limit", - record->bytenr); + bytenr); btrfs_err_32bit_limit(fs_info); return -EOVERFLOW; } #endif - lockdep_assert_held(&delayed_refs->lock); - trace_btrfs_qgroup_trace_extent(fs_info, record); + trace_btrfs_qgroup_trace_extent(fs_info, record, bytenr); xa_lock(&delayed_refs->dirty_extents); existing = xa_load(&delayed_refs->dirty_extents, index); @@ -2066,12 +2065,17 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, * transaction committing, but not now as qgroup accounting will be wrong again. */ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, - struct btrfs_qgroup_extent_record *qrecord) + struct btrfs_qgroup_extent_record *qrecord, + u64 bytenr) { - struct btrfs_backref_walk_ctx ctx = { 0 }; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_backref_walk_ctx ctx = { + .bytenr = bytenr, + .fs_info = fs_info, + }; int ret; - if (!btrfs_qgroup_full_accounting(trans->fs_info)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; /* * We are always called in a context where we are already holding a @@ -2094,16 +2098,13 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, */ ASSERT(trans != NULL); - if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) + if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) return 0; - ctx.bytenr = qrecord->bytenr; - ctx.fs_info = trans->fs_info; - ret = btrfs_find_all_roots(&ctx, true); if (ret < 0) { - qgroup_mark_inconsistent(trans->fs_info); - btrfs_warn(trans->fs_info, + qgroup_mark_inconsistent(fs_info); + btrfs_warn(fs_info, "error accounting new delayed refs extent (err code: %d), quota inconsistent", ret); return 0; @@ -2138,7 +2139,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup_extent_record *record; - struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs; const unsigned long index = (bytenr >> fs_info->sectorsize_bits); int ret; @@ -2148,26 +2149,21 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, if (!record) return -ENOMEM; - if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, index, GFP_NOFS)) { + if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) { kfree(record); return -ENOMEM; } - delayed_refs = &trans->transaction->delayed_refs; - record->bytenr = bytenr; record->num_bytes = num_bytes; - record->old_roots = NULL; - spin_lock(&delayed_refs->lock); - ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record); - spin_unlock(&delayed_refs->lock); + ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record, bytenr); if (ret) { /* Clean up if insertion fails or item exists. */ xa_release(&delayed_refs->dirty_extents, index); kfree(record); return 0; } - return btrfs_qgroup_trace_extent_post(trans, record); + return btrfs_qgroup_trace_extent_post(trans, record, bytenr); } /* @@ -2652,7 +2648,6 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, if (!extent_buffer_uptodate(root_eb)) { struct btrfs_tree_parent_check check = { - .has_first_key = false, .transid = root_gen, .level = root_level }; @@ -3043,14 +3038,16 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) delayed_refs = &trans->transaction->delayed_refs; qgroup_to_skip = delayed_refs->qgroup_to_skip; xa_for_each(&delayed_refs->dirty_extents, index, record) { + const u64 bytenr = (((u64)index) << fs_info->sectorsize_bits); + num_dirty_extents++; - trace_btrfs_qgroup_account_extents(fs_info, record); + trace_btrfs_qgroup_account_extents(fs_info, record, bytenr); if (!ret && !(fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) { struct btrfs_backref_walk_ctx ctx = { 0 }; - ctx.bytenr = record->bytenr; + ctx.bytenr = bytenr; ctx.fs_info = fs_info; /* @@ -3092,7 +3089,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) ulist_del(record->old_roots, qgroup_to_skip, 0); } - ret = btrfs_qgroup_account_extent(trans, record->bytenr, + ret = btrfs_qgroup_account_extent(trans, bytenr, record->num_bytes, record->old_roots, new_roots); @@ -4196,13 +4193,20 @@ static int try_flush_qgroup(struct btrfs_root *root) return 0; } - btrfs_run_delayed_iputs(root->fs_info); - btrfs_wait_on_delayed_iputs(root->fs_info); ret = btrfs_start_delalloc_snapshot(root, true); if (ret < 0) goto out; btrfs_wait_ordered_extents(root, U64_MAX, NULL); + /* + * After waiting for ordered extents run delayed iputs in order to free + * space from unlinked files before committing the current transaction, + * as ordered extents may have been holding the last reference of an + * inode and they add a delayed iput when they complete. + */ + btrfs_run_delayed_iputs(root->fs_info); + btrfs_wait_on_delayed_iputs(root->fs_info); + ret = btrfs_commit_current_transaction(root); out: clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); @@ -4687,8 +4691,7 @@ out: * BOTH POINTERS ARE BEFORE TREE SWAP * @last_snapshot: last snapshot generation of the subvolume tree */ -int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, - struct btrfs_root *subvol_root, +int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, struct btrfs_block_group *bg, struct extent_buffer *subvol_parent, int subvol_slot, struct extent_buffer *reloc_parent, int reloc_slot, @@ -4894,17 +4897,6 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) xa_destroy(&trans->delayed_refs.dirty_extents); } -void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes) -{ - if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) - return; - - if (!is_fstree(root)) - return; - - btrfs_qgroup_free_refroot(fs_info, root, rsv_bytes, BTRFS_QGROUP_RSV_DATA); -} - int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, const struct btrfs_squota_delta *delta) { diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index c229256d6fd5..e233cc79af18 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -127,7 +127,12 @@ struct btrfs_inode; * Record a dirty extent, and info qgroup to update quota on it */ struct btrfs_qgroup_extent_record { - u64 bytenr; + /* + * The bytenr of the extent is given by its index in the dirty_extents + * xarray of struct btrfs_delayed_ref_root left shifted by + * fs_info->sectorsize_bits. + */ + u64 num_bytes; /* @@ -345,9 +350,11 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); int btrfs_qgroup_trace_extent_nolock( struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record); + struct btrfs_qgroup_extent_record *record, + u64 bytenr); int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, - struct btrfs_qgroup_extent_record *qrecord); + struct btrfs_qgroup_extent_record *qrecord, + u64 bytenr); int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes); int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, @@ -432,8 +439,7 @@ void btrfs_qgroup_init_swapped_blocks( struct btrfs_qgroup_swapped_blocks *swapped_blocks); void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root); -int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, - struct btrfs_root *subvol_root, +int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, struct btrfs_block_group *bg, struct extent_buffer *subvol_parent, int subvol_slot, struct extent_buffer *reloc_parent, int reloc_slot, @@ -442,7 +448,6 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb); void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans); bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info); -void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes); int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, const struct btrfs_squota_delta *delta); diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 4c859b550f6c..9ffc79f250fb 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -13,6 +13,39 @@ #include "volumes.h" #include "print-tree.h" +static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + const struct btrfs_key *oldkey, + u64 newlen, u64 frontpad) +{ + struct btrfs_stripe_extent *extent; + struct extent_buffer *leaf; + int slot; + size_t item_size; + struct btrfs_key newkey = { + .objectid = oldkey->objectid + frontpad, + .type = BTRFS_RAID_STRIPE_KEY, + .offset = newlen, + }; + + ASSERT(oldkey->type == BTRFS_RAID_STRIPE_KEY); + + leaf = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size(leaf, slot); + extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); + + for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) { + struct btrfs_raid_stride *stride = &extent->strides[i]; + u64 phys; + + phys = btrfs_raid_stride_physical(leaf, stride); + btrfs_set_raid_stride_physical(leaf, stride, phys + frontpad); + } + + btrfs_set_item_key_safe(trans, path, &newkey); +} + int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -36,23 +69,24 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le while (1) { key.objectid = start; key.type = BTRFS_RAID_STRIPE_KEY; - key.offset = length; + key.offset = 0; ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1); if (ret < 0) break; - if (ret > 0) { - ret = 0; - if (path->slots[0] == 0) - break; + + if (path->slots[0] == btrfs_header_nritems(path->nodes[0])) path->slots[0]--; - } leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); found_start = key.objectid; found_end = found_start + key.offset; + ret = 0; + + if (key.type != BTRFS_RAID_STRIPE_KEY) + break; /* That stripe ends before we start, we're done. */ if (found_end <= start) @@ -61,7 +95,40 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le trace_btrfs_raid_extent_delete(fs_info, start, end, found_start, found_end); - ASSERT(found_start >= start && found_end <= end); + /* + * The stripe extent starts before the range we want to delete: + * + * |--- RAID Stripe Extent ---| + * |--- keep ---|--- drop ---| + * + * This means we have to duplicate the tree item, truncate the + * length to the new size and then re-insert the item. + */ + if (found_start < start) { + u64 diff = start - found_start; + + btrfs_partially_delete_raid_extent(trans, path, &key, + diff, 0); + break; + } + + /* + * The stripe extent ends after the range we want to delete: + * + * |--- RAID Stripe Extent ---| + * |--- drop ---|--- keep ---| + * + * This means we have to duplicate the tree item, truncate the + * length to the new size and then re-insert the item. + */ + if (found_end > end) { + u64 diff = found_end - end; + + btrfs_partially_delete_raid_extent(trans, path, &key, + diff, diff); + break; + } + ret = btrfs_del_item(trans, stripe_root, path); if (ret) break; @@ -108,8 +175,9 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans, return ret; } -static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, - struct btrfs_io_context *bioc) +EXPORT_FOR_TESTS +int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_io_context *bioc) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_key stripe_key; @@ -233,7 +301,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, found_end = found_logical + found_length; if (found_logical > end) { - ret = -ENOENT; + ret = -ENODATA; goto out; } @@ -279,10 +347,10 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, } /* If we're here, we haven't found the requested devid in the stripe. */ - ret = -ENOENT; + ret = -ENODATA; out: if (ret > 0) - ret = -ENOENT; + ret = -ENODATA; if (ret && ret != -EIO && !stripe->rst_search_commit_root) { btrfs_debug(fs_info, "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s", diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h index 1ac1c21aac2f..541836421778 100644 --- a/fs/btrfs/raid-stripe-tree.h +++ b/fs/btrfs/raid-stripe-tree.h @@ -28,6 +28,11 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_extent *ordered_extent); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_io_context *bioc); +#endif + static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info, u64 map_type) { diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 39bec672df0c..cdd373c27784 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1272,8 +1272,7 @@ static inline void bio_list_put(struct bio_list *bio_list) static void assert_rbio(struct btrfs_raid_bio *rbio) { - if (!IS_ENABLED(CONFIG_BTRFS_DEBUG) || - !IS_ENABLED(CONFIG_BTRFS_ASSERT)) + if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) return; /* diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f3834f8d26b4..bf267bdfa8f8 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1244,7 +1244,7 @@ again: * The real subtree rescan is delayed until we have new * CoW on the subtree root node before transaction commit. */ - ret = btrfs_qgroup_add_swapped_blocks(trans, dest, + ret = btrfs_qgroup_add_swapped_blocks(dest, rc->block_group, parent, slot, path->nodes[level], path->slots[level], last_snapshot); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 3a3427428074..204c928beaf9 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1656,8 +1656,7 @@ static u32 stripe_length(const struct scrub_stripe *stripe) stripe->bg->start + stripe->bg->length - stripe->logical); } -static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, - struct scrub_stripe *stripe) +static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; @@ -1704,8 +1703,18 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, &stripe_len, &bioc, &io_stripe, &mirror); btrfs_put_bioc(bioc); if (err < 0) { - set_bit(i, &stripe->io_error_bitmap); - set_bit(i, &stripe->error_bitmap); + if (err != -ENODATA) { + /* + * Earlier btrfs_get_raid_extent_offset() + * returned -ENODATA, which means there's + * no entry for the corresponding range + * in the stripe tree. But if it's in + * the extent tree, then it's a preallocated + * extent and not an error. + */ + set_bit(i, &stripe->io_error_bitmap); + set_bit(i, &stripe->error_bitmap); + } continue; } @@ -1743,7 +1752,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) { - scrub_submit_extent_sector_read(sctx, stripe); + scrub_submit_extent_sector_read(stripe); return; } @@ -1954,7 +1963,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, ASSERT(sctx->raid56_data_stripes); /* - * For data stripe search, we cannot re-use the same extent/csum paths, + * For data stripe search, we cannot reuse the same extent/csum paths, * as the data stripe bytenr may be smaller than previous extent. Thus * we have to use our own extent/csum paths. */ @@ -2103,7 +2112,6 @@ out: */ static int scrub_simple_mirror(struct scrub_ctx *sctx, struct btrfs_block_group *bg, - struct btrfs_chunk_map *map, u64 logical_start, u64 logical_length, struct btrfs_device *device, u64 physical, int mirror_num) @@ -2222,7 +2230,7 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx, * just RAID1, so we can reuse scrub_simple_mirror() to scrub * this stripe. */ - ret = scrub_simple_mirror(sctx, bg, map, cur_logical, + ret = scrub_simple_mirror(sctx, bg, cur_logical, BTRFS_STRIPE_LEN, device, cur_physical, mirror_num); if (ret) @@ -2256,7 +2264,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, /* Offset inside the chunk */ u64 offset; u64 stripe_logical; - int stop_loop = 0; /* Extent_path should be released by now. */ ASSERT(sctx->extent_path.nodes[0] == NULL); @@ -2307,7 +2314,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, * Only @physical and @mirror_num needs to calculated using * @stripe_index. */ - ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length, + ret = scrub_simple_mirror(sctx, bg, bg->start, bg->length, scrub_dev, map->stripes[stripe_index].physical, stripe_index + 1); offset = 0; @@ -2362,7 +2369,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, * We can reuse scrub_simple_mirror() here, as the repair part * is still based on @mirror_num. */ - ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN, + ret = scrub_simple_mirror(sctx, bg, logical, BTRFS_STRIPE_LEN, scrub_dev, physical, 1); if (ret < 0) goto out; @@ -2370,14 +2377,8 @@ next: logical += increment; physical += BTRFS_STRIPE_LEN; spin_lock(&sctx->stat_lock); - if (stop_loop) - sctx->stat.last_physical = - map->stripes[stripe_index].physical + dev_stripe_len; - else - sctx->stat.last_physical = physical; + sctx->stat.last_physical = physical; spin_unlock(&sctx->stat_lock); - if (stop_loop) - break; } out: ret2 = flush_scrub_stripes(sctx); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index b068469871f8..7254279c3cc9 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -980,9 +980,7 @@ static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) return ret; } -typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index, - struct fs_path *p, - void *ctx); +typedef int (*iterate_inode_ref_t)(u64 dir, struct fs_path *p, void *ctx); /* * Helper function to iterate the entries in ONE btrfs_inode_ref or @@ -1007,8 +1005,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, u32 name_len; char *start; int ret = 0; - int num = 0; - int index; u64 dir; unsigned long name_off; unsigned long elem_size; @@ -1043,13 +1039,11 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, iref = (struct btrfs_inode_ref *)(ptr + cur); name_len = btrfs_inode_ref_name_len(eb, iref); name_off = (unsigned long)(iref + 1); - index = btrfs_inode_ref_index(eb, iref); dir = found_key->offset; } else { extref = (struct btrfs_inode_extref *)(ptr + cur); name_len = btrfs_inode_extref_name_len(eb, extref); name_off = (unsigned long)&extref->name; - index = btrfs_inode_extref_index(eb, extref); dir = btrfs_inode_extref_parent(eb, extref); } @@ -1094,10 +1088,9 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, } cur += elem_size + name_len; - ret = iterate(num, dir, index, p, ctx); + ret = iterate(dir, p, ctx); if (ret) goto out; - num++; } out: @@ -1227,8 +1220,7 @@ out: return ret; } -static int __copy_first_ref(int num, u64 dir, int index, - struct fs_path *p, void *ctx) +static int __copy_first_ref(u64 dir, struct fs_path *p, void *ctx) { int ret; struct fs_path *pt = ctx; @@ -3768,7 +3760,6 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, struct recorded_ref *parent_ref, const bool is_orphan) { - struct btrfs_fs_info *fs_info = sctx->parent_root->fs_info; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key di_key; @@ -3797,7 +3788,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, goto out; } - di = btrfs_match_dir_item_name(fs_info, path, parent_ref->name, + di = btrfs_match_dir_item_name(path, parent_ref->name, parent_ref->name_len); if (!di) { ret = 0; @@ -4708,8 +4699,7 @@ out: return ret; } -static int record_new_ref_if_needed(int num, u64 dir, int index, - struct fs_path *name, void *ctx) +static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) { int ret = 0; struct send_ctx *sctx = ctx; @@ -4738,8 +4728,7 @@ out: return ret; } -static int record_deleted_ref_if_needed(int num, u64 dir, int index, - struct fs_path *name, void *ctx) +static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) { int ret = 0; struct send_ctx *sctx = ctx; @@ -5677,10 +5666,11 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, * Note that send_buf is a mapping of send_buf_pages, so this is really * reading into send_buf. */ - ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset, + ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), disk_bytenr, disk_num_bytes, sctx->send_buf_pages + - (data_offset >> PAGE_SHIFT)); + (data_offset >> PAGE_SHIFT), + NULL); if (ret) goto out; @@ -8135,7 +8125,20 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a * making it RW. This also protects against deletion. */ spin_lock(&send_root->root_item_lock); - if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) { + /* + * Unlikely but possible, if the subvolume is marked for deletion but + * is slow to remove the directory entry, send can still be started. + */ + if (btrfs_root_dead(send_root)) { + spin_unlock(&send_root->root_item_lock); + return -EPERM; + } + /* Userspace tools do the checks and warn the user if it's not RO. */ + if (!btrfs_root_readonly(send_root)) { + spin_unlock(&send_root->root_item_lock); + return -EPERM; + } + if (send_root->dedupe_in_progress) { dedupe_in_progress_warn(send_root); spin_unlock(&send_root->root_item_lock); return -EAGAIN; @@ -8144,15 +8147,6 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a spin_unlock(&send_root->root_item_lock); /* - * Userspace tools do the checks and warn the user if it's - * not RO. - */ - if (!btrfs_root_readonly(send_root)) { - ret = -EPERM; - goto out; - } - - /* * Check that we don't overflow at later allocations, we request * clone_sources_count + 1 items, and compare to unsigned long inside * access_ok. Also set an upper limit for allocation size so this can't @@ -8217,15 +8211,6 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a } sctx->send_root = send_root; - /* - * Unlikely but possible, if the subvolume is marked for deletion but - * is slow to remove the directory entry, send can still be started - */ - if (btrfs_root_dead(sctx->send_root)) { - ret = -EPERM; - goto out; - } - sctx->clone_roots_cnt = arg->clone_sources_count; if (sctx->proto >= 2) { diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index b07f4aa66878..9309886c5ea1 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -16,7 +16,7 @@ struct btrfs_ioctl_send_args; #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" /* Conditional support for the upcoming protocol version. */ -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL #define BTRFS_SEND_STREAM_VERSION 3 #else #define BTRFS_SEND_STREAM_VERSION 2 diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index d5a9cd8a4fd8..255e85f78313 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1279,7 +1279,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) * If we are freeing inodes, we want to make sure all delayed iputs have * completed, because they could have been on an inode with i_nlink == 0, and * thus have been truncated and freed up space. But again this space is not - * immediately re-usable, it comes in the form of a delayed ref, which must be + * immediately reusable, it comes in the form of a delayed ref, which must be * run and then the transaction must be committed. * * COMMIT_TRANS @@ -1488,8 +1488,7 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); } -static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, +static void wait_reserve_ticket(struct btrfs_space_info *space_info, struct reserve_ticket *ticket) { @@ -1547,7 +1546,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, case BTRFS_RESERVE_FLUSH_DATA: case BTRFS_RESERVE_FLUSH_ALL: case BTRFS_RESERVE_FLUSH_ALL_STEAL: - wait_reserve_ticket(fs_info, space_info, ticket); + wait_reserve_ticket(space_info, ticket); break; case BTRFS_RESERVE_FLUSH_LIMIT: priority_reclaim_metadata_space(fs_info, space_info, ticket, @@ -1984,8 +1983,7 @@ static bool is_reclaim_urgent(struct btrfs_space_info *space_info) return unalloc < data_chunk_size; } -static void do_reclaim_sweep(const struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, int raid) +static void do_reclaim_sweep(struct btrfs_space_info *space_info, int raid) { struct btrfs_block_group *bg; int thresh_pct; @@ -2081,6 +2079,6 @@ void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info) if (!btrfs_should_periodic_reclaim(space_info)) continue; for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) - do_reclaim_sweep(fs_info, space_info, raid); + do_reclaim_sweep(space_info, raid); } } diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index fe4d719d506b..8c68059ac1b0 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -140,12 +140,10 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, return ERR_PTR(-ENOMEM); spin_lock_init(&ret->lock); - if (type == BTRFS_SUBPAGE_METADATA) { + if (type == BTRFS_SUBPAGE_METADATA) atomic_set(&ret->eb_refs, 0); - } else { - atomic_set(&ret->readers, 0); - atomic_set(&ret->writers, 0); - } + else + atomic_set(&ret->nr_locked, 0); return ret; } @@ -221,62 +219,6 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, __start_bit; \ }) -void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); - const int nbits = len >> fs_info->sectorsize_bits; - unsigned long flags; - - - btrfs_subpage_assert(fs_info, folio, start, len); - - spin_lock_irqsave(&subpage->lock, flags); - /* - * Even though it's just for reading the page, no one should have - * locked the subpage range. - */ - ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); - bitmap_set(subpage->bitmaps, start_bit, nbits); - atomic_add(nbits, &subpage->readers); - spin_unlock_irqrestore(&subpage->lock, flags); -} - -void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); - const int nbits = len >> fs_info->sectorsize_bits; - unsigned long flags; - bool is_data; - bool last; - - btrfs_subpage_assert(fs_info, folio, start, len); - is_data = is_data_inode(BTRFS_I(folio->mapping->host)); - - spin_lock_irqsave(&subpage->lock, flags); - - /* The range should have already been locked. */ - ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits)); - ASSERT(atomic_read(&subpage->readers) >= nbits); - - bitmap_clear(subpage->bitmaps, start_bit, nbits); - last = atomic_sub_and_test(nbits, &subpage->readers); - - /* - * For data we need to unlock the page if the last read has finished. - * - * And please don't replace @last with atomic_sub_and_test() call - * inside if () condition. - * As we want the atomic_sub_and_test() to be always executed. - */ - if (is_data && last) - folio_unlock(folio); - spin_unlock_irqrestore(&subpage->lock, flags); -} - static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) { u64 orig_start = *start; @@ -295,28 +237,8 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) orig_start + orig_len) - *start; } -static void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); - const int nbits = (len >> fs_info->sectorsize_bits); - unsigned long flags; - int ret; - - btrfs_subpage_assert(fs_info, folio, start, len); - - spin_lock_irqsave(&subpage->lock, flags); - ASSERT(atomic_read(&subpage->readers) == 0); - ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); - bitmap_set(subpage->bitmaps, start_bit, nbits); - ret = atomic_add_return(nbits, &subpage->writers); - ASSERT(ret == nbits); - spin_unlock_irqrestore(&subpage->lock, flags); -} - -static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage = folio_get_private(folio); const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); @@ -334,9 +256,9 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf * extent_clear_unlock_delalloc() for compression path. * * This @locked_page is locked by plain lock_page(), thus its - * subpage::writers is 0. Handle them in a special way. + * subpage::locked is 0. Handle them in a special way. */ - if (atomic_read(&subpage->writers) == 0) { + if (atomic_read(&subpage->nr_locked) == 0) { spin_unlock_irqrestore(&subpage->lock, flags); return true; } @@ -345,40 +267,13 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf clear_bit(bit, subpage->bitmaps); cleared++; } - ASSERT(atomic_read(&subpage->writers) >= cleared); - last = atomic_sub_and_test(cleared, &subpage->writers); + ASSERT(atomic_read(&subpage->nr_locked) >= cleared); + last = atomic_sub_and_test(cleared, &subpage->nr_locked); spin_unlock_irqrestore(&subpage->lock, flags); return last; } /* - * Lock a folio for delalloc page writeback. - * - * Return -EAGAIN if the page is not properly initialized. - * Return 0 with the page locked, and writer counter updated. - * - * Even with 0 returned, the page still need extra check to make sure - * it's really the correct page, as the caller is using - * filemap_get_folios_contig(), which can race with page invalidating. - */ -int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { - folio_lock(folio); - return 0; - } - folio_lock(folio); - if (!folio_test_private(folio) || !folio_get_private(folio)) { - folio_unlock(folio); - return -EAGAIN; - } - btrfs_subpage_clamp_range(folio, &start, &len); - btrfs_subpage_start_writer(fs_info, folio, start, len); - return 0; -} - -/* * Handle different locked folios: * * - Non-subpage folio @@ -394,8 +289,8 @@ int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, * bitmap, reduce the writer lock number, and unlock the page if that's * the last locked range. */ -void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage = folio_get_private(folio); @@ -408,24 +303,24 @@ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, /* * For subpage case, there are two types of locked page. With or - * without writers number. + * without locked number. * - * Since we own the page lock, no one else could touch subpage::writers + * Since we own the page lock, no one else could touch subpage::locked * and we are safe to do several atomic operations without spinlock. */ - if (atomic_read(&subpage->writers) == 0) { - /* No writers, locked by plain lock_page(). */ + if (atomic_read(&subpage->nr_locked) == 0) { + /* No subpage lock, locked by plain lock_page(). */ folio_unlock(folio); return; } btrfs_subpage_clamp_range(folio, &start, &len); - if (btrfs_subpage_end_and_test_writer(fs_info, folio, start, len)) + if (btrfs_subpage_end_and_test_lock(fs_info, folio, start, len)) folio_unlock(folio); } -void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, - struct folio *folio, unsigned long bitmap) +void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, + struct folio *folio, unsigned long bitmap) { struct btrfs_subpage *subpage = folio_get_private(folio); const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked; @@ -434,13 +329,13 @@ void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, int cleared = 0; int bit; - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { + if (!btrfs_is_subpage(fs_info, folio->mapping)) { folio_unlock(folio); return; } - if (atomic_read(&subpage->writers) == 0) { - /* No writers, locked by plain lock_page(). */ + if (atomic_read(&subpage->nr_locked) == 0) { + /* No subpage lock, locked by plain lock_page(). */ folio_unlock(folio); return; } @@ -450,8 +345,8 @@ void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, if (test_and_clear_bit(bit + start_bit, subpage->bitmaps)) cleared++; } - ASSERT(atomic_read(&subpage->writers) >= cleared); - last = atomic_sub_and_test(cleared, &subpage->writers); + ASSERT(atomic_read(&subpage->nr_locked) >= cleared); + last = atomic_sub_and_test(cleared, &subpage->nr_locked); spin_unlock_irqrestore(&subpage->lock, flags); if (last) folio_unlock(folio); @@ -776,8 +671,8 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, * This populates the involved subpage ranges so that subpage helpers can * properly unlock them. */ -void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage; unsigned long flags; @@ -796,58 +691,11 @@ void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, /* Target range should not yet be locked. */ ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); bitmap_set(subpage->bitmaps, start_bit, nbits); - ret = atomic_add_return(nbits, &subpage->writers); + ret = atomic_add_return(nbits, &subpage->nr_locked); ASSERT(ret <= fs_info->sectors_per_page); spin_unlock_irqrestore(&subpage->lock, flags); } -/* - * Find any subpage writer locked range inside @folio, starting at file offset - * @search_start. The caller should ensure the folio is locked. - * - * Return true and update @found_start_ret and @found_len_ret to the first - * writer locked range. - * Return false if there is no writer locked range. - */ -bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 search_start, - u64 *found_start_ret, u32 *found_len_ret) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - const u32 sectors_per_page = fs_info->sectors_per_page; - const unsigned int len = PAGE_SIZE - offset_in_page(search_start); - const unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, - locked, search_start, len); - const unsigned int locked_bitmap_start = sectors_per_page * btrfs_bitmap_nr_locked; - const unsigned int locked_bitmap_end = locked_bitmap_start + sectors_per_page; - unsigned long flags; - int first_zero; - int first_set; - bool found = false; - - ASSERT(folio_test_locked(folio)); - spin_lock_irqsave(&subpage->lock, flags); - first_set = find_next_bit(subpage->bitmaps, locked_bitmap_end, start_bit); - if (first_set >= locked_bitmap_end) - goto out; - - found = true; - - *found_start_ret = folio_pos(folio) + - ((first_set - locked_bitmap_start) << fs_info->sectorsize_bits); - /* - * Since @first_set is ensured to be smaller than locked_bitmap_end - * here, @found_start_ret should be inside the folio. - */ - ASSERT(*found_start_ret < folio_pos(folio) + PAGE_SIZE); - - first_zero = find_next_zero_bit(subpage->bitmaps, locked_bitmap_end, first_set); - *found_len_ret = (first_zero - first_set) << fs_info->sectorsize_bits; -out: - spin_unlock_irqrestore(&subpage->lock, flags); - return found; -} - #define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ { \ const int sectors_per_page = fs_info->sectors_per_page; \ diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 4b85d91d0e18..428fa9389fd4 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -45,14 +45,6 @@ enum { struct btrfs_subpage { /* Common members for both data and metadata pages */ spinlock_t lock; - /* - * Both data and metadata needs to track how many readers are for the - * page. - * Data relies on @readers to unlock the page when last reader finished. - * While metadata doesn't need page unlock, it needs to prevent - * page::private get cleared before the last end_page_read(). - */ - atomic_t readers; union { /* * Structures only used by metadata @@ -62,8 +54,12 @@ struct btrfs_subpage { */ atomic_t eb_refs; - /* Structures only used by data */ - atomic_t writers; + /* + * Structures only used by data, + * + * How many sectors inside the page is locked. + */ + atomic_t nr_locked; }; unsigned long bitmaps[]; }; @@ -95,23 +91,12 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage); void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); -void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); -void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); - -int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); -void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); -void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); -void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, - struct folio *folio, unsigned long bitmap); -bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 search_start, - u64 *found_start_ret, u32 *found_len_ret); - +void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len); +void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len); +void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, + struct folio *folio, unsigned long bitmap); /* * Template for subpage related operations. * diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 881d62d81b9f..97a85d180b61 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -28,7 +28,6 @@ #include <linux/btrfs.h> #include <linux/security.h> #include <linux/fs_parser.h> -#include <linux/swap.h> #include "messages.h" #include "delayed-inode.h" #include "ctree.h" @@ -946,8 +945,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec } static int btrfs_fill_super(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - void *data) + struct btrfs_fs_devices *fs_devices) { struct inode *inode; struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -971,7 +969,7 @@ static int btrfs_fill_super(struct super_block *sb, return err; } - err = open_ctree(sb, fs_devices, (char *)data); + err = open_ctree(sb, fs_devices); if (err) { btrfs_err(fs_info, "open_ctree failed"); return err; @@ -1893,7 +1891,7 @@ static int btrfs_get_tree_super(struct fs_context *fc) snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id); btrfs_sb(sb)->bdev_holder = &btrfs_fs_type; - ret = btrfs_fill_super(sb, fs_devices, NULL); + ret = btrfs_fill_super(sb, fs_devices); } if (ret) { @@ -2257,7 +2255,10 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false); if (IS_ERR_OR_NULL(device)) { mutex_unlock(&uuid_mutex); - ret = PTR_ERR(device); + if (IS_ERR(device)) + ret = PTR_ERR(device); + else + ret = 0; break; } ret = !(device->fs_devices->num_devices == @@ -2396,13 +2397,7 @@ static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_contro trace_btrfs_extent_map_shrinker_count(fs_info, nr); - /* - * Only report the real number for DEBUG builds, as there are reports of - * serious performance degradation caused by too frequent shrinks. - */ - if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) - return nr; - return 0; + return nr; } static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc) @@ -2410,16 +2405,10 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan); struct btrfs_fs_info *fs_info = btrfs_sb(sb); - /* - * We may be called from any task trying to allocate memory and we don't - * want to slow it down with scanning and dropping extent maps. It would - * also cause heavy lock contention if many tasks concurrently enter - * here. Therefore only allow kswapd tasks to scan and drop extent maps. - */ - if (!current_is_kswapd()) - return 0; + btrfs_free_extent_maps(fs_info, nr_to_scan); - return btrfs_free_extent_maps(fs_info, nr_to_scan); + /* The extent map shrinker runs asynchronously, so always return 0. */ + return 0; } static const struct super_operations btrfs_super_ops = { diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 03926ad467c9..b843308e2bc6 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1390,7 +1390,7 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, btrfs_bg_reclaim_threshold_store); -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL static ssize_t btrfs_offload_csum_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { @@ -1450,7 +1450,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, bg_reclaim_threshold), BTRFS_ATTR_PTR(, commit_stats), BTRFS_ATTR_PTR(, temp_fsid), -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL BTRFS_ATTR_PTR(, offload_csum), #endif NULL, diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index ce50847e1e01..e607b5d52fb1 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -29,6 +29,7 @@ const char *test_error[] = { [TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group", [TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map", [TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map", + [TEST_ALLOC_IO_CONTEXT] = "cannot allocate io context", }; static const struct super_operations btrfs_test_super_ops = { @@ -291,6 +292,9 @@ int btrfs_run_sanity_tests(void) ret = btrfs_test_free_space_tree(sectorsize, nodesize); if (ret) goto out; + ret = btrfs_test_raid_stripe_tree(sectorsize, nodesize); + if (ret) + goto out; } } ret = btrfs_test_extent_map(); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index dc2f2ab15fa5..b524ecf2f452 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -24,6 +24,7 @@ enum { TEST_ALLOC_BLOCK_GROUP, TEST_ALLOC_EXTENT_MAP, TEST_ALLOC_CHUNK_MAP, + TEST_ALLOC_IO_CONTEXT, }; extern const char *test_error[]; @@ -37,6 +38,7 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize); int btrfs_test_inodes(u32 sectorsize, u32 nodesize); int btrfs_test_qgroups(u32 sectorsize, u32 nodesize); int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); +int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize); int btrfs_test_extent_map(void); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize); diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c new file mode 100644 index 000000000000..30f17eb7b6a8 --- /dev/null +++ b/fs/btrfs/tests/raid-stripe-tree-tests.c @@ -0,0 +1,538 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 Western Digital Corporation or its affiliates. + */ + +#include <linux/sizes.h> +#include "../fs.h" +#include "../disk-io.h" +#include "../transaction.h" +#include "../volumes.h" +#include "../raid-stripe-tree.h" +#include "btrfs-tests.h" + +#define RST_TEST_NUM_DEVICES (2) +#define RST_TEST_RAID1_TYPE (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1) + +typedef int (*test_func_t)(struct btrfs_trans_handle *trans); + +static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_devices, + u64 devid) +{ + struct btrfs_device *dev; + + list_for_each_entry(dev, &fs_devices->devices, dev_list) { + if (dev->devid == devid) + return dev; + } + + return NULL; +} + +/* + * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then + * delete the 1st 32K, making the new start address 1M+32K. + */ +static int test_front_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, SZ_32K); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + SZ_32K); + goto out; + } + + len = SZ_32K; + ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_32K, &len, + map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical + SZ_32K, logical + SZ_32K + len); + goto out; + } + + if (io_stripe.physical != logical + SZ_32K) { + test_err("invalid physical address, expected %llu, got %llu", + logical + SZ_32K, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_32K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_32K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (!ret) { + ret = -EINVAL; + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + logical, logical + SZ_32K); + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K); +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* + * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then + * truncate the stripe extent down to 32K. + */ +static int test_tail_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + if (!io_stripe.dev) { + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical + SZ_32K, logical + SZ_64K); + goto out; + } + + len = SZ_32K; + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu, got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_32K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_32K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, len); + if (ret) + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + len); + +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* + * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then + * overwrite the whole range giving it new physical address at an offset of 1G. + * The intent of this test is to exercise the 'update_raid_extent_item()' + * function called be btrfs_insert_one_raid_extent(). + */ +static int test_create_update_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + if (!io_stripe.dev) { + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = SZ_1G + logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("updating RAID extent failed: %d", ret); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical + SZ_1G) { + test_err("invalid physical address, expected %llu, got %llu", + logical + SZ_1G, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, len); + if (ret) + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + len); + +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* + * Test a simple 64K RST write on a 2 disk RAID1 at a logical address of 1M. + * The "physical" copy on device 0 is at 1M, on device 1 it is at 1G+1M. + */ +static int test_simple_create_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + bioc->map_type = map_type; + bioc->size = SZ_64K; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + if (!io_stripe.dev) { + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, len); + if (ret) + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + len); + +out: + btrfs_put_bioc(bioc); + return ret; +} + +static const test_func_t tests[] = { + test_simple_create_delete, + test_create_update_delete, + test_tail_delete, + test_front_delete, +}; + +static int run_test(test_func_t test, u32 sectorsize, u32 nodesize) +{ + struct btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root = NULL; + int ret; + + fs_info = btrfs_alloc_dummy_fs_info(sectorsize, nodesize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + ret = -ENOMEM; + goto out; + } + + root = btrfs_alloc_dummy_root(fs_info); + if (IS_ERR(root)) { + test_std_err(TEST_ALLOC_ROOT); + ret = PTR_ERR(root); + goto out; + } + btrfs_set_super_compat_ro_flags(root->fs_info->super_copy, + BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE); + root->root_key.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + fs_info->stripe_root = root; + root->fs_info->tree_root = root; + + root->node = alloc_test_extent_buffer(root->fs_info, nodesize); + if (IS_ERR(root->node)) { + test_std_err(TEST_ALLOC_EXTENT_BUFFER); + ret = PTR_ERR(root->node); + goto out; + } + btrfs_set_header_level(root->node, 0); + btrfs_set_header_nritems(root->node, 0); + root->alloc_bytenr += 2 * nodesize; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_device *dev; + + dev = btrfs_alloc_dummy_device(fs_info); + if (IS_ERR(dev)) { + test_err("cannot allocate device"); + ret = PTR_ERR(dev); + goto out; + } + dev->devid = i; + } + + btrfs_init_dummy_trans(&trans, root->fs_info); + ret = test(&trans); + if (ret) + goto out; + +out: + btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); + + return ret; +} + +int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize) +{ + int ret = 0; + + test_msg("running raid-stripe-tree tests"); + for (int i = 0; i < ARRAY_SIZE(tests); i++) { + ret = run_test(tests[i], sectorsize, nodesize); + if (ret) { + test_err("test-case %ps failed with %d\n", tests[i], ret); + goto out; + } + } + +out: + return ret; +} diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0fc873af891f..dc0b837efd5d 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -141,8 +141,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) WARN_ON(refcount_read(&transaction->use_count) == 0); if (refcount_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); - WARN_ON(!RB_EMPTY_ROOT( - &transaction->delayed_refs.href_root.rb_root)); + WARN_ON(!xa_empty(&transaction->delayed_refs.head_refs)); WARN_ON(!xa_empty(&transaction->delayed_refs.dirty_extents)); if (transaction->delayed_refs.pending_csums) btrfs_err(transaction->fs_info, @@ -349,9 +348,8 @@ loop: memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs)); - cur_trans->delayed_refs.href_root = RB_ROOT_CACHED; + xa_init(&cur_trans->delayed_refs.head_refs); xa_init(&cur_trans->delayed_refs.dirty_extents); - atomic_set(&cur_trans->delayed_refs.num_entries, 0); /* * although the tree mod log is per file system and not per transaction, @@ -2052,7 +2050,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) spin_unlock(&fs_info->trans_lock); - btrfs_cleanup_one_transaction(trans->transaction, fs_info); + btrfs_cleanup_one_transaction(trans->transaction); spin_lock(&fs_info->trans_lock); if (cur_trans == fs_info->running_transaction) diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index dd9ce9b9f69e..184fa5c0062a 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -33,7 +33,7 @@ struct btrfs_path; */ #define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1) -/* Radix-tree tag for roots that are part of the trasaction. */ +/* Radix-tree tag for roots that are part of the transaction. */ #define BTRFS_ROOT_TRANS_TAG 0 enum btrfs_trans_state { diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 7b50263723bc..148d8cefa40e 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -2183,8 +2183,8 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) return 0; } -int btrfs_verify_level_key(struct extent_buffer *eb, int level, - struct btrfs_key *first_key, u64 parent_transid) +int btrfs_verify_level_key(struct extent_buffer *eb, + const struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; int found_level; @@ -2192,16 +2192,16 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, int ret; found_level = btrfs_header_level(eb); - if (found_level != level) { + if (found_level != check->level) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: tree level check failed\n"); btrfs_err(fs_info, "tree level mismatch detected, bytenr=%llu level expected=%u has=%u", - eb->start, level, found_level); + eb->start, check->level, found_level); return -EIO; } - if (!first_key) + if (!check->has_first_key) return 0; /* @@ -2226,15 +2226,15 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, btrfs_node_key_to_cpu(eb, &found_key, 0); else btrfs_item_key_to_cpu(eb, &found_key, 0); - ret = btrfs_comp_cpu_keys(first_key, &found_key); + ret = btrfs_comp_cpu_keys(&check->first_key, &found_key); if (ret) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: tree first key check failed\n"); btrfs_err(fs_info, "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", - eb->start, parent_transid, first_key->objectid, - first_key->type, first_key->offset, + eb->start, check->transid, check->first_key.objectid, + check->first_key.type, check->first_key.offset, found_key.objectid, found_key.type, found_key.offset); } diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 01669cfa6578..db67f96cbe4b 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -69,7 +69,7 @@ int btrfs_check_node(struct extent_buffer *node); int btrfs_check_chunk_valid(struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 logical); int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner); -int btrfs_verify_level_key(struct extent_buffer *eb, int level, - struct btrfs_key *first_key, u64 parent_transid); +int btrfs_verify_level_key(struct extent_buffer *eb, + const struct btrfs_tree_parent_check *check); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9637c7cdc0cf..c8d6587688b3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -6204,7 +6204,6 @@ static int log_delayed_deletions_full(struct btrfs_trans_handle *trans, static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, - struct btrfs_log_ctx *ctx, const struct list_head *delayed_del_list, const struct btrfs_delayed_item *first, const struct btrfs_delayed_item **last_ret) @@ -6265,7 +6264,7 @@ static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans, if (ret < 0) { return ret; } else if (ret == 0) { - ret = batch_delete_dir_index_items(trans, inode, path, ctx, + ret = batch_delete_dir_index_items(trans, inode, path, delayed_del_list, curr, &last); if (ret) diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index b382a4c443d4..1ac2678fc4ca 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -909,7 +909,6 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, * is freed (its refcount is decremented). */ struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, struct extent_buffer *eb, u64 time_seq) { diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h index 6308c577a4a4..1c12566040db 100644 --- a/fs/btrfs/tree-mod-log.h +++ b/fs/btrfs/tree-mod-log.h @@ -41,7 +41,6 @@ int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot, enum btrfs_mod_log_op op); int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb); struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, struct extent_buffer *eb, u64 time_seq); struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index eb51b609190f..1cccaf9c2b0d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -733,6 +733,114 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb) } /* + * We can have very weird soft links passed in. + * One example is "/proc/self/fd/<fd>", which can be a soft link to + * a block device. + * + * But it's never a good idea to use those weird names. + * Here we check if the path (not following symlinks) is a good one inside + * "/dev/". + */ +static bool is_good_dev_path(const char *dev_path) +{ + struct path path = { .mnt = NULL, .dentry = NULL }; + char *path_buf = NULL; + char *resolved_path; + bool is_good = false; + int ret; + + if (!dev_path) + goto out; + + path_buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!path_buf) + goto out; + + /* + * Do not follow soft link, just check if the original path is inside + * "/dev/". + */ + ret = kern_path(dev_path, 0, &path); + if (ret) + goto out; + resolved_path = d_path(&path, path_buf, PATH_MAX); + if (IS_ERR(resolved_path)) + goto out; + if (strncmp(resolved_path, "/dev/", strlen("/dev/"))) + goto out; + is_good = true; +out: + kfree(path_buf); + path_put(&path); + return is_good; +} + +static int get_canonical_dev_path(const char *dev_path, char *canonical) +{ + struct path path = { .mnt = NULL, .dentry = NULL }; + char *path_buf = NULL; + char *resolved_path; + int ret; + + if (!dev_path) { + ret = -EINVAL; + goto out; + } + + path_buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!path_buf) { + ret = -ENOMEM; + goto out; + } + + ret = kern_path(dev_path, LOOKUP_FOLLOW, &path); + if (ret) + goto out; + resolved_path = d_path(&path, path_buf, PATH_MAX); + ret = strscpy(canonical, resolved_path, PATH_MAX); +out: + kfree(path_buf); + path_put(&path); + return ret; +} + +static bool is_same_device(struct btrfs_device *device, const char *new_path) +{ + struct path old = { .mnt = NULL, .dentry = NULL }; + struct path new = { .mnt = NULL, .dentry = NULL }; + char *old_path = NULL; + bool is_same = false; + int ret; + + if (!device->name) + goto out; + + old_path = kzalloc(PATH_MAX, GFP_NOFS); + if (!old_path) + goto out; + + rcu_read_lock(); + ret = strscpy(old_path, rcu_str_deref(device->name), PATH_MAX); + rcu_read_unlock(); + if (ret < 0) + goto out; + + ret = kern_path(old_path, LOOKUP_FOLLOW, &old); + if (ret) + goto out; + ret = kern_path(new_path, LOOKUP_FOLLOW, &new); + if (ret) + goto out; + if (path_equal(&old, &new)) + is_same = true; +out: + kfree(old_path); + path_put(&old); + path_put(&new); + return is_same; +} + +/* * Add new device to list of registered devices * * Returns: @@ -852,7 +960,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, MAJOR(path_devt), MINOR(path_devt), current->comm, task_pid_nr(current)); - } else if (!device->name || strcmp(device->name->str, path)) { + } else if (!device->name || !is_same_device(device, path)) { /* * When FS is already mounted. * 1. If you are here and if the device->name is NULL that @@ -1383,12 +1491,23 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, bool new_device_added = false; struct btrfs_device *device = NULL; struct file *bdev_file; + char *canonical_path = NULL; u64 bytenr; dev_t devt; int ret; lockdep_assert_held(&uuid_mutex); + if (!is_good_dev_path(path)) { + canonical_path = kmalloc(PATH_MAX, GFP_KERNEL); + if (canonical_path) { + ret = get_canonical_dev_path(path, canonical_path); + if (ret < 0) { + kfree(canonical_path); + canonical_path = NULL; + } + } + } /* * Avoid an exclusive open here, as the systemd-udev may initiate the * device scan which may race with the user's mount or mkfs command, @@ -1433,7 +1552,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, goto free_disk_super; } - device = device_list_add(path, disk_super, &new_device_added); + device = device_list_add(canonical_path ? : path, disk_super, + &new_device_added); if (!IS_ERR(device) && new_device_added) btrfs_free_stale_devices(device->devt, device); @@ -1442,6 +1562,7 @@ free_disk_super: error_bdev_put: fput(bdev_file); + kfree(canonical_path); return device; } @@ -2721,8 +2842,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE); if (seeding_dev) { - btrfs_clear_sb_rdonly(sb); - /* GFP_KERNEL allocation must not be under device_list_mutex */ seed_devices = btrfs_init_sprout(fs_info); if (IS_ERR(seed_devices)) { @@ -2865,8 +2984,6 @@ error_sysfs: mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); error_trans: - if (seeding_dev) - btrfs_set_sb_rdonly(sb); if (trans) btrfs_end_transaction(trans); error_free_zone: @@ -5310,7 +5427,7 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; - /* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */ + /* stripe_size is fixed in zoned filesystem. Reduce ndevs instead. */ if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies, ctl->stripe_size) + ctl->nparity, @@ -5842,24 +5959,6 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, return len; } -int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) -{ - struct btrfs_chunk_map *map; - int ret = 0; - - if (!btrfs_fs_incompat(fs_info, RAID56)) - return 0; - - map = btrfs_get_chunk_map(fs_info, logical, len); - - if (!WARN_ON(IS_ERR(map))) { - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) - ret = 1; - btrfs_free_chunk_map(map); - } - return ret; -} - static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, int first, int dev_replace_is_ongoing) @@ -5920,9 +6019,9 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, return preferred_mirror; } -static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, - u64 logical, - u16 total_stripes) +EXPORT_FOR_TESTS +struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, + u64 logical, u16 total_stripes) { struct btrfs_io_context *bioc; @@ -6481,13 +6580,15 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, max_len = btrfs_max_io_len(map, map_offset, &io_geom); *length = min_t(u64, map->chunk_len - map_offset, max_len); - down_read(&dev_replace->rwsem); + if (dev_replace->replace_task != current) + down_read(&dev_replace->rwsem); + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); /* * Hold the semaphore for read during the whole operation, write is * requested at commit time but must wait. */ - if (!dev_replace_is_ongoing) + if (!dev_replace_is_ongoing && dev_replace->replace_task != current) up_read(&dev_replace->rwsem); switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { @@ -6627,7 +6728,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, bioc->mirror_num = io_geom.mirror_num; out: - if (dev_replace_is_ongoing) { + if (dev_replace_is_ongoing && dev_replace->replace_task != current) { lockdep_assert_held(&dev_replace->rwsem); /* Unlock and let waiting writers proceed */ up_read(&dev_replace->rwsem); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 4481575dd70f..3a416b1bc24c 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -306,7 +306,7 @@ enum btrfs_read_policy { BTRFS_NR_READ_POLICY, }; -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL /* * Checksum mode - offload it to workqueues or do it synchronously in * btrfs_submit_chunk(). @@ -430,7 +430,7 @@ struct btrfs_fs_devices { /* Policy used to read the mirrored stripes. */ enum btrfs_read_policy read_policy; -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL /* Checksum mode - offload it or do it synchronously. */ enum btrfs_offload_csum_mode offload_csum_mode; #endif @@ -741,8 +741,6 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans); void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev); void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev); void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev); -int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, - u64 logical, u64 len); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical); u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map); @@ -840,4 +838,9 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, + u64 logical, u16 total_stripes); +#endif + #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index ce464cd8e0ac..bc18710d1dcf 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -85,7 +85,6 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, { struct btrfs_dir_item *di = NULL; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; size_t name_len = strlen(name); int ret = 0; @@ -143,14 +142,14 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, */ ret = 0; btrfs_assert_tree_write_locked(path->nodes[0]); - di = btrfs_match_dir_item_name(fs_info, path, name, name_len); + di = btrfs_match_dir_item_name(path, name, name_len); if (!di && !(flags & XATTR_REPLACE)) { ret = -ENOSPC; goto out; } } else if (ret == -EEXIST) { ret = 0; - di = btrfs_match_dir_item_name(fs_info, path, name, name_len); + di = btrfs_match_dir_item_name(path, name, name_len); ASSERT(di); /* logic error */ } else if (ret) { goto out; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 100abc00b794..ddf0d5a448a7 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -194,7 +194,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, pg_off = offset_in_page(start); cur_len = btrfs_calc_input_length(orig_end, start); data_in = kmap_local_folio(in_folio, pg_off); - start += PAGE_SIZE; + start += cur_len; workspace->strm.next_in = data_in; workspace->strm.avail_in = cur_len; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 69d03feea4e0..cb32966380f5 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1739,7 +1739,7 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio) return false; /* - * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the + * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the * extent layout the relocation code has. * Furthermore we have set aside own block-group from which only the * relocation "process" can allocate and make sure only one process at a @@ -1973,7 +1973,7 @@ int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, if (block_group->meta_write_pointer > eb->start) return -EBUSY; - /* If for_sync, this hole will be filled with trasnsaction commit. */ + /* If for_sync, this hole will be filled with transaction commit. */ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) return -EAGAIN; return -EBUSY; diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 866607fd3e58..5232b56d5892 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -111,6 +111,8 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES; struct list_head *pos, *next; + ASSERT(timer == &wsm.timer); + spin_lock(&wsm.lock); if (list_empty(&wsm.lru_list)) { @@ -495,7 +497,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, /* Check if we need more input */ if (workspace->in_buf.pos == workspace->in_buf.size) { - tot_in += PAGE_SIZE; + tot_in += workspace->in_buf.size; kunmap_local(workspace->in_buf.src); workspace->in_buf.src = NULL; folio_put(in_folio); |