From 597441b3436a43011f31ce71dc0a6c0bf5ce958a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 11 May 2023 12:45:59 -0400 Subject: btrfs: use nofs when cleaning up aborted transactions Our CI system caught a lockdep splat: ====================================================== WARNING: possible circular locking dependency detected 6.3.0-rc7+ #1167 Not tainted ------------------------------------------------------ kswapd0/46 is trying to acquire lock: ffff8c6543abd650 (sb_internal#2){++++}-{0:0}, at: btrfs_commit_inode_delayed_inode+0x5f/0x120 but task is already holding lock: ffffffffabe61b40 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x4aa/0x7a0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (fs_reclaim){+.+.}-{0:0}: fs_reclaim_acquire+0xa5/0xe0 kmem_cache_alloc+0x31/0x2c0 alloc_extent_state+0x1d/0xd0 __clear_extent_bit+0x2e0/0x4f0 try_release_extent_mapping+0x216/0x280 btrfs_release_folio+0x2e/0x90 invalidate_inode_pages2_range+0x397/0x470 btrfs_cleanup_dirty_bgs+0x9e/0x210 btrfs_cleanup_one_transaction+0x22/0x760 btrfs_commit_transaction+0x3b7/0x13a0 create_subvol+0x59b/0x970 btrfs_mksubvol+0x435/0x4f0 __btrfs_ioctl_snap_create+0x11e/0x1b0 btrfs_ioctl_snap_create_v2+0xbf/0x140 btrfs_ioctl+0xa45/0x28f0 __x64_sys_ioctl+0x88/0xc0 do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x72/0xdc -> #0 (sb_internal#2){++++}-{0:0}: __lock_acquire+0x1435/0x21a0 lock_acquire+0xc2/0x2b0 start_transaction+0x401/0x730 btrfs_commit_inode_delayed_inode+0x5f/0x120 btrfs_evict_inode+0x292/0x3d0 evict+0xcc/0x1d0 inode_lru_isolate+0x14d/0x1e0 __list_lru_walk_one+0xbe/0x1c0 list_lru_walk_one+0x58/0x80 prune_icache_sb+0x39/0x60 super_cache_scan+0x161/0x1f0 do_shrink_slab+0x163/0x340 shrink_slab+0x1d3/0x290 shrink_node+0x300/0x720 balance_pgdat+0x35c/0x7a0 kswapd+0x205/0x410 kthread+0xf0/0x120 ret_from_fork+0x29/0x50 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(sb_internal#2); lock(fs_reclaim); lock(sb_internal#2); *** DEADLOCK *** 3 locks held by kswapd0/46: #0: ffffffffabe61b40 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x4aa/0x7a0 #1: ffffffffabe50270 (shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0x113/0x290 #2: ffff8c6543abd0e0 (&type->s_umount_key#44){++++}-{3:3}, at: super_cache_scan+0x38/0x1f0 stack backtrace: CPU: 0 PID: 46 Comm: kswapd0 Not tainted 6.3.0-rc7+ #1167 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 Call Trace: dump_stack_lvl+0x58/0x90 check_noncircular+0xd6/0x100 ? save_trace+0x3f/0x310 ? add_lock_to_list+0x97/0x120 __lock_acquire+0x1435/0x21a0 lock_acquire+0xc2/0x2b0 ? btrfs_commit_inode_delayed_inode+0x5f/0x120 start_transaction+0x401/0x730 ? btrfs_commit_inode_delayed_inode+0x5f/0x120 btrfs_commit_inode_delayed_inode+0x5f/0x120 btrfs_evict_inode+0x292/0x3d0 ? lock_release+0x134/0x270 ? __pfx_wake_bit_function+0x10/0x10 evict+0xcc/0x1d0 inode_lru_isolate+0x14d/0x1e0 __list_lru_walk_one+0xbe/0x1c0 ? __pfx_inode_lru_isolate+0x10/0x10 ? __pfx_inode_lru_isolate+0x10/0x10 list_lru_walk_one+0x58/0x80 prune_icache_sb+0x39/0x60 super_cache_scan+0x161/0x1f0 do_shrink_slab+0x163/0x340 shrink_slab+0x1d3/0x290 shrink_node+0x300/0x720 balance_pgdat+0x35c/0x7a0 kswapd+0x205/0x410 ? __pfx_autoremove_wake_function+0x10/0x10 ? __pfx_kswapd+0x10/0x10 kthread+0xf0/0x120 ? 
__pfx_kthread+0x10/0x10 ret_from_fork+0x29/0x50 This happens because when we abort the transaction in the transaction commit path we call invalidate_inode_pages2_range on our block group cache inodes (if we have space cache v1) and any delalloc inodes we may have. The plain invalidate_inode_pages2_range() call passes through GFP_KERNEL, which makes sense in most cases, but not here. Wrap these two invalidate callees with memalloc_nofs_save/memalloc_nofs_restore to make sure we don't end up with the fs reclaim dependency under the transaction dependency. CC: stable@vger.kernel.org # 4.14+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index fbf9006c6234..6de6dcf2743e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4936,7 +4936,11 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) */ inode = igrab(&btrfs_inode->vfs_inode); if (inode) { + unsigned int nofs_flag; + + nofs_flag = memalloc_nofs_save(); invalidate_inode_pages2(inode->i_mapping); + memalloc_nofs_restore(nofs_flag); iput(inode); } spin_lock(&root->delalloc_lock); @@ -5042,7 +5046,12 @@ static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache) inode = cache->io_ctl.inode; if (inode) { + unsigned int nofs_flag; + + nofs_flag = memalloc_nofs_save(); invalidate_inode_pages2(inode->i_mapping); + memalloc_nofs_restore(nofs_flag); + BTRFS_I(inode)->generation = 0; cache->io_ctl.inode = NULL; iput(inode); -- cgit v1.2.3 From 5ad9b4719fc9bc4715c7e19875a962095b0577e7 Mon Sep 17 00:00:00 2001 From: pengfuyuan Date: Tue, 23 May 2023 15:09:55 +0800 Subject: btrfs: fix csum_tree_block page iteration to avoid tripping on -Werror=array-bounds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When compiling on a MIPS 64-bit machine we get these warnings: In file included from ./arch/mips/include/asm/cacheflush.h:13, from ./include/linux/cacheflush.h:5, from ./include/linux/highmem.h:8, from ./include/linux/bvec.h:10, from ./include/linux/blk_types.h:10, from ./include/linux/blkdev.h:9, from fs/btrfs/disk-io.c:7: fs/btrfs/disk-io.c: In function ‘csum_tree_block’: fs/btrfs/disk-io.c:100:34: error: array subscript 1 is above array bounds of ‘struct page *[1]’ [-Werror=array-bounds] 100 | kaddr = page_address(buf->pages[i]); | ~~~~~~~~~~^~~ ./include/linux/mm.h:2135:48: note: in definition of macro ‘page_address’ 2135 | #define page_address(page) lowmem_page_address(page) | ^~~~ cc1: all warnings being treated as errors We can check if i overflows to solve the problem. However, this doesn't make much sense, since i == 1 and num_pages == 1 doesn't execute the body of the loop. In addition, i < num_pages can also ensure that buf->pages[i] will not cross the boundary. Unfortunately, this doesn't help with the problem observed here: gcc still complains. To fix this add a compile-time condition for the extent buffer page array size limit, which would eventually lead to eliminating the whole for loop. 
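As a minimal standalone sketch of the pattern the fix relies on (NELEMS and sum_tail() are invented names, not btrfs code): with a one-element array, gcc's -Werror=array-bounds flags the arr[i] access even though the loop body can never execute, and adding a compile-time-constant term to the loop condition lets the compiler prove the body dead and drop the warning along with it.

#define NELEMS 1	/* stands in for INLINE_EXTENT_BUFFER_PAGES on 64K-page systems */

static unsigned long sum_tail(const unsigned long arr[NELEMS], int nelems)
{
	unsigned long sum = 0;
	int i;

	/*
	 * "i < nelems" alone may still trip -Werror=array-bounds for arr[1];
	 * the constant "NELEMS > 1" term makes the loop provably dead at
	 * compile time, so both the loop and the warning disappear.
	 */
	for (i = 1; i < nelems && NELEMS > 1; i++)
		sum += arr[i];

	return sum;
}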
CC: stable@vger.kernel.org # 5.10+ Signed-off-by: pengfuyuan Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6de6dcf2743e..2b1b227505f3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -96,7 +96,7 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result) crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, first_page_part - BTRFS_CSUM_SIZE); - for (i = 1; i < num_pages; i++) { + for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) { kaddr = page_address(buf->pages[i]); crypto_shash_update(shash, kaddr, PAGE_SIZE); } -- cgit v1.2.3 From 917ac77846b907dfdbd878688a9a61236ad6c51e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 26 May 2023 20:30:20 +0800 Subject: btrfs: subpage: fix a crash in metadata repair path [BUG] Test case btrfs/027 would crash with subpage (64K page size, 4K sectorsize) with the following dying messages: debug: map_length=16384 length=65536 type=metadata|raid6(0x104) assertion failed: map_length >= length, in fs/btrfs/volumes.c:8093 ------------[ cut here ]------------ kernel BUG at fs/btrfs/messages.c:259! Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 Call trace: btrfs_assertfail+0x28/0x2c [btrfs] btrfs_map_repair_block+0x150/0x2b8 [btrfs] btrfs_repair_io_failure+0xd4/0x31c [btrfs] btrfs_read_extent_buffer+0x150/0x16c [btrfs] read_tree_block+0x38/0xbc [btrfs] read_tree_root_path+0xfc/0x1bc [btrfs] btrfs_get_root_ref.part.0+0xd4/0x3a8 [btrfs] open_ctree+0xa30/0x172c [btrfs] btrfs_mount_root+0x3c4/0x4a4 [btrfs] legacy_get_tree+0x30/0x60 vfs_get_tree+0x28/0xec vfs_kern_mount.part.0+0x90/0xd4 vfs_kern_mount+0x14/0x28 btrfs_mount+0x114/0x418 [btrfs] legacy_get_tree+0x30/0x60 vfs_get_tree+0x28/0xec path_mount+0x3e0/0xb64 __arm64_sys_mount+0x200/0x2d8 invoke_syscall+0x48/0x114 el0_svc_common.constprop.0+0x60/0x11c do_el0_svc+0x38/0x98 el0_svc+0x40/0xa8 el0t_64_sync_handler+0xf4/0x120 el0t_64_sync+0x190/0x194 Code: aa0403e2 b0fff060 91010000 959c2024 (d4210000) [CAUSE] In btrfs/027 we test RAID6 with missing devices, in this particular case, we're repairing a metadata at the end of a data stripe. But at btrfs_repair_io_failure(), we always pass a full PAGE for repair, and for subpage case this can cross stripe boundary and lead to the above BUG_ON(). This metadata repair code is always there, since the introduction of subpage support, but this can trigger BUG_ON() after the bio split ability at btrfs_map_bio(). [FIX] Instead of passing the old PAGE_SIZE, we calculate the correct length based on the eb size and page size for both regular and subpage cases. 
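To make the new length calculation concrete, here is the arithmetic with illustrative numbers (assuming a 64K page and a 16K nodesize, which matches the map_length=16384 vs length=65536 values in the splat; the addresses themselves are invented):

/* A 16K eb sitting at the end of a 64K page. */
u64 eb_start   = 0x1c000;	/* eb->start                  */
u64 eb_len     = 0x4000;	/* eb->len, 16K nodesize      */
u64 page_start = 0x10000;	/* page_offset(p)             */
u64 page_size  = 0x10000;	/* PAGE_SIZE, 64K             */

u64 start = max(eb_start, page_start);				/* 0x1c000        */
u64 end   = min(eb_start + eb_len, page_start + page_size);	/* 0x20000        */
u32 len   = end - start;					/* 0x4000 = 16384 */

/*
 * The old code passed PAGE_SIZE (65536) starting at 0x1c000, which is what
 * crossed the stripe boundary and tripped the assertion.
 */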
CC: stable@vger.kernel.org # 6.3+ Reviewed-by: Christoph Hellwig Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2b1b227505f3..88e6d1072a35 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -242,7 +242,6 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) { struct btrfs_fs_info *fs_info = eb->fs_info; - u64 start = eb->start; int i, num_pages = num_extent_pages(eb); int ret = 0; @@ -251,12 +250,14 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; + u64 start = max_t(u64, eb->start, page_offset(p)); + u64 end = min_t(u64, eb->start + eb->len, page_offset(p) + PAGE_SIZE); + u32 len = end - start; - ret = btrfs_repair_io_failure(fs_info, 0, start, PAGE_SIZE, - start, p, start - page_offset(p), mirror_num); + ret = btrfs_repair_io_failure(fs_info, 0, start, len, + start, p, offset_in_page(start), mirror_num); if (ret) break; - start += PAGE_SIZE; } return ret; -- cgit v1.2.3 From 745806fb4554f334e6406fa82b328562aa48f08f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 11 Jun 2023 08:09:13 +0800 Subject: btrfs: do not ASSERT() on duplicated global roots [BUG] Syzbot reports a reproducible ASSERT() when using rescue=usebackuproot mount option on a corrupted fs. The full report can be found here: https://syzkaller.appspot.com/bug?extid=c4614eae20a166c25bf0 BTRFS error (device loop0: state C): failed to load root csum assertion failed: !tmp, in fs/btrfs/disk-io.c:1103 ------------[ cut here ]------------ kernel BUG at fs/btrfs/ctree.h:3664! invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 3608 Comm: syz-executor356 Not tainted 6.0.0-rc7-syzkaller-00029-g3800a713b607 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/26/2022 RIP: 0010:assertfail+0x1a/0x1c fs/btrfs/ctree.h:3663 RSP: 0018:ffffc90003aaf250 EFLAGS: 00010246 RAX: 0000000000000032 RBX: 0000000000000000 RCX: f21c13f886638400 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: ffff888021c640a0 R08: ffffffff816bd38d R09: ffffed10173667f1 R10: ffffed10173667f1 R11: 1ffff110173667f0 R12: dffffc0000000000 R13: ffff8880229c21f7 R14: ffff888021c64060 R15: ffff8880226c0000 FS: 0000555556a73300(0000) GS:ffff8880b9b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055a2637d7a00 CR3: 00000000709c4000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: btrfs_global_root_insert+0x1a7/0x1b0 fs/btrfs/disk-io.c:1103 load_global_roots_objectid+0x482/0x8c0 fs/btrfs/disk-io.c:2467 load_global_roots fs/btrfs/disk-io.c:2501 [inline] btrfs_read_roots fs/btrfs/disk-io.c:2528 [inline] init_tree_roots+0xccb/0x203c fs/btrfs/disk-io.c:2939 open_ctree+0x1e53/0x33df fs/btrfs/disk-io.c:3574 btrfs_fill_super+0x1c6/0x2d0 fs/btrfs/super.c:1456 btrfs_mount_root+0x885/0x9a0 fs/btrfs/super.c:1824 legacy_get_tree+0xea/0x180 fs/fs_context.c:610 vfs_get_tree+0x88/0x270 fs/super.c:1530 fc_mount fs/namespace.c:1043 [inline] vfs_kern_mount+0xc9/0x160 fs/namespace.c:1073 btrfs_mount+0x3d3/0xbb0 fs/btrfs/super.c:1884 [CAUSE] Since the introduction of global roots, we handle csum/extent/free-space-tree roots as global roots, even if no extent-tree-v2 
feature is enabled. So for regular csum/extent/fst roots, we load them into fs_info::global_root_tree rb tree. And we should not expect any conflicts in that rb tree, thus we have an ASSERT() inside btrfs_global_root_insert(). But rescue=usebackuproot can break the assumption, as we will try to load those trees again and again as long as we have bad roots and have backup roots slot remaining. So in that case we can have conflicting roots in the rb tree, and triggering the ASSERT() crash. [FIX] We can safely remove that ASSERT(), as the caller will properly put the offending root. To make further debugging easier, also add two explicit error messages: - Error message for conflicting global roots - Error message when using backup roots slot Reported-by: syzbot+a694851c6ab28cbcfb9c@syzkaller.appspotmail.com Fixes: abed4aaae4f7 ("btrfs: track the csum, extent, and free space trees in a rb tree") CC: stable@vger.kernel.org # 6.1+ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 88e6d1072a35..dabc79c1af1b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -996,13 +996,18 @@ int btrfs_global_root_insert(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *tmp; + int ret = 0; write_lock(&fs_info->global_root_lock); tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp); write_unlock(&fs_info->global_root_lock); - ASSERT(!tmp); - return tmp ? -EEXIST : 0; + if (tmp) { + ret = -EEXIST; + btrfs_warn(fs_info, "global root %llu %llu already exists", + root->root_key.objectid, root->root_key.offset); + } + return ret; } void btrfs_global_root_delete(struct btrfs_root *root) @@ -2842,6 +2847,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) /* We can't trust the free space cache either */ btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE); + btrfs_warn(fs_info, "try to load backup roots slot %d", i); ret = read_backup_root(fs_info, i); backup_index = ret; if (ret < 0) -- cgit v1.2.3 From 8bfec2e426e40597d594db07de4e53def38c8879 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 09:06:15 +0200 Subject: btrfs: remove hipri_workers workqueue Now that btrfs_wq_submit_bio is never called for synchronous I/O, the hipri_workers workqueue is not used anymore and can be removed. 
Reviewed-by: Chris Mason Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 5 +---- fs/btrfs/disk-io.c | 6 +----- fs/btrfs/fs.h | 1 - fs/btrfs/super.c | 1 - 4 files changed, 2 insertions(+), 11 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 990a517c069b..2e2e1abd5838 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -615,10 +615,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, run_one_async_free); - if (op_is_sync(bbio->bio.bi_opf)) - btrfs_queue_work(fs_info->hipri_workers, &async->work); - else - btrfs_queue_work(fs_info->workers, &async->work); + btrfs_queue_work(fs_info->workers, &async->work); return true; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dabc79c1af1b..f4adda2b4317 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1991,7 +1991,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) { btrfs_destroy_workqueue(fs_info->fixup_workers); btrfs_destroy_workqueue(fs_info->delalloc_workers); - btrfs_destroy_workqueue(fs_info->hipri_workers); btrfs_destroy_workqueue(fs_info->workers); if (fs_info->endio_workers) destroy_workqueue(fs_info->endio_workers); @@ -2186,9 +2185,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) fs_info->workers = btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16); - fs_info->hipri_workers = - btrfs_alloc_workqueue(fs_info, "worker-high", - flags | WQ_HIGHPRI, max_active, 16); fs_info->delalloc_workers = btrfs_alloc_workqueue(fs_info, "delalloc", @@ -2225,7 +2221,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) fs_info->discard_ctl.discard_workers = alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1); - if (!(fs_info->workers && fs_info->hipri_workers && + if (!(fs_info->workers && fs_info->delalloc_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && fs_info->compressed_write_workers && diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 0d98fc5f6f44..840e4def18b5 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -543,7 +543,6 @@ struct btrfs_fs_info { * A third pool does submit_bio to avoid deadlocking with the other two. */ struct btrfs_workqueue *workers; - struct btrfs_workqueue *hipri_workers; struct btrfs_workqueue *delalloc_workers; struct btrfs_workqueue *flush_workers; struct workqueue_struct *endio_workers; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index efeb1a9d040a..8b1c1225245e 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1631,7 +1631,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, old_pool_size, new_pool_size); btrfs_workqueue_set_max(fs_info->workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); workqueue_set_max_active(fs_info->endio_workers, new_pool_size); -- cgit v1.2.3 From 85d8a826c7cde17f9cca9c4debecb4538bdb6573 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sat, 29 Apr 2023 16:07:12 -0400 Subject: btrfs: simplify btrfs_check_leaf_* helpers into a single helper We have two helpers for checking leaves, because we have an extra check for debugging in btrfs_mark_buffer_dirty(), and at that stage we may have item data that isn't consistent yet. 
However we can handle this case internally in the helper, if BTRFS_HEADER_FLAG_WRITTEN is set we know the buffer should be internally consistent, otherwise we need to skip checking the item data. Simplify this helper down a single helper and handle the item data checking logic internally to the helper. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 12 +++++------- fs/btrfs/tree-checker.c | 20 +++++++------------- fs/btrfs/tree-checker.h | 13 +------------ 3 files changed, 13 insertions(+), 32 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f4adda2b4317..922755926ee9 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -326,7 +326,7 @@ static int csum_one_extent_buffer(struct extent_buffer *eb) if (btrfs_header_level(eb)) ret = btrfs_check_node(eb); else - ret = btrfs_check_leaf_full(eb); + ret = btrfs_check_leaf(eb); if (ret < 0) goto error; @@ -583,7 +583,7 @@ static int validate_extent_buffer(struct extent_buffer *eb, * that we don't try and read the other copies of this block, just * return -EIO. */ - if (found_level == 0 && btrfs_check_leaf_full(eb)) { + if (found_level == 0 && btrfs_check_leaf(eb)) { set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = -EIO; } @@ -4701,12 +4701,10 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) fs_info->dirty_metadata_batch); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY /* - * Since btrfs_mark_buffer_dirty() can be called with item pointer set - * but item data not updated. - * So here we should only check item pointers, not item data. + * btrfs_check_leaf() won't check item data if we don't have WRITTEN + * set, so this will only validate the basic structure of the items. */ - if (btrfs_header_level(buf) == 0 && - btrfs_check_leaf_relaxed(buf)) { + if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(buf)) { btrfs_print_leaf(buf); ASSERT(0); } diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index e2b54793bf0c..2eff4e2f2c47 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1674,7 +1674,7 @@ static int check_leaf_item(struct extent_buffer *leaf, return ret; } -static int check_leaf(struct extent_buffer *leaf, bool check_item_data) +int btrfs_check_leaf(struct extent_buffer *leaf) { struct btrfs_fs_info *fs_info = leaf->fs_info; /* No valid key type is 0, so all key should be larger than this key */ @@ -1807,7 +1807,11 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) return -EUCLEAN; } - if (check_item_data) { + /* + * We only want to do this if WRITTEN is set, otherwise the leaf + * may be in some intermediate state and won't appear valid. 
+ */ + if (btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_WRITTEN)) { /* * Check if the item size and content meet other * criteria @@ -1824,17 +1828,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) return 0; } - -int btrfs_check_leaf_full(struct extent_buffer *leaf) -{ - return check_leaf(leaf, true); -} -ALLOW_ERROR_INJECTION(btrfs_check_leaf_full, ERRNO); - -int btrfs_check_leaf_relaxed(struct extent_buffer *leaf) -{ - return check_leaf(leaf, false); -} +ALLOW_ERROR_INJECTION(btrfs_check_leaf, ERRNO); int btrfs_check_node(struct extent_buffer *node) { diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index bfb5efa4e01f..48321e8d91bb 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -40,18 +40,7 @@ struct btrfs_tree_parent_check { u8 level; }; -/* - * Comprehensive leaf checker. - * Will check not only the item pointers, but also every possible member - * in item data. - */ -int btrfs_check_leaf_full(struct extent_buffer *leaf); - -/* - * Less strict leaf checker. - * Will only check item pointers, not reading item data. - */ -int btrfs_check_leaf_relaxed(struct extent_buffer *leaf); +int btrfs_check_leaf(struct extent_buffer *leaf); int btrfs_check_node(struct extent_buffer *node); int btrfs_check_chunk_valid(struct extent_buffer *leaf, -- cgit v1.2.3 From 2cac5af16537f8eafaba5e525fbb5a93160ebaff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sat, 29 Apr 2023 16:07:17 -0400 Subject: btrfs: move btrfs_verify_level_key into tree-checker.c This is more a buffer validation helper, move it into the tree-checker files where it makes more sense. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 58 ------------------------------------------------- fs/btrfs/disk-io.h | 2 -- fs/btrfs/tree-checker.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/tree-checker.h | 2 ++ 4 files changed, 60 insertions(+), 60 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 922755926ee9..83518ed71bfd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -180,64 +180,6 @@ int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, return 0; } -int btrfs_verify_level_key(struct extent_buffer *eb, int level, - struct btrfs_key *first_key, u64 parent_transid) -{ - struct btrfs_fs_info *fs_info = eb->fs_info; - int found_level; - struct btrfs_key found_key; - int ret; - - found_level = btrfs_header_level(eb); - if (found_level != level) { - WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), - KERN_ERR "BTRFS: tree level check failed\n"); - btrfs_err(fs_info, -"tree level mismatch detected, bytenr=%llu level expected=%u has=%u", - eb->start, level, found_level); - return -EIO; - } - - if (!first_key) - return 0; - - /* - * For live tree block (new tree blocks in current transaction), - * we need proper lock context to avoid race, which is impossible here. - * So we only checks tree blocks which is read from disk, whose - * generation <= fs_info->last_trans_committed. 
- */ - if (btrfs_header_generation(eb) > fs_info->last_trans_committed) - return 0; - - /* We have @first_key, so this @eb must have at least one item */ - if (btrfs_header_nritems(eb) == 0) { - btrfs_err(fs_info, - "invalid tree nritems, bytenr=%llu nritems=0 expect >0", - eb->start); - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); - return -EUCLEAN; - } - - if (found_level) - btrfs_node_key_to_cpu(eb, &found_key, 0); - else - btrfs_item_key_to_cpu(eb, &found_key, 0); - ret = btrfs_comp_cpu_keys(first_key, &found_key); - - if (ret) { - WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), - KERN_ERR "BTRFS: tree first key check failed\n"); - btrfs_err(fs_info, -"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", - eb->start, parent_transid, first_key->objectid, - first_key->type, first_key->offset, - found_key.objectid, found_key.type, - found_key.offset); - } - return ret; -} - static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) { diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 4d5772330110..a26ab3cbb449 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -31,8 +31,6 @@ struct btrfs_tree_parent_check; void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info); void btrfs_init_fs_info(struct btrfs_fs_info *fs_info); -int btrfs_verify_level_key(struct extent_buffer *eb, int level, - struct btrfs_key *first_key, u64 parent_transid); struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, struct btrfs_tree_parent_check *check); struct extent_buffer *btrfs_find_create_tree_block( diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index bd85205a6249..9e92548a6aa2 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1963,3 +1963,61 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) } return 0; } + +int btrfs_verify_level_key(struct extent_buffer *eb, int level, + struct btrfs_key *first_key, u64 parent_transid) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + int found_level; + struct btrfs_key found_key; + int ret; + + found_level = btrfs_header_level(eb); + if (found_level != level) { + WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), + KERN_ERR "BTRFS: tree level check failed\n"); + btrfs_err(fs_info, +"tree level mismatch detected, bytenr=%llu level expected=%u has=%u", + eb->start, level, found_level); + return -EIO; + } + + if (!first_key) + return 0; + + /* + * For live tree block (new tree blocks in current transaction), + * we need proper lock context to avoid race, which is impossible here. + * So we only checks tree blocks which is read from disk, whose + * generation <= fs_info->last_trans_committed. 
+ */ + if (btrfs_header_generation(eb) > fs_info->last_trans_committed) + return 0; + + /* We have @first_key, so this @eb must have at least one item */ + if (btrfs_header_nritems(eb) == 0) { + btrfs_err(fs_info, + "invalid tree nritems, bytenr=%llu nritems=0 expect >0", + eb->start); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + return -EUCLEAN; + } + + if (found_level) + btrfs_node_key_to_cpu(eb, &found_key, 0); + else + btrfs_item_key_to_cpu(eb, &found_key, 0); + ret = btrfs_comp_cpu_keys(first_key, &found_key); + + if (ret) { + WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), + KERN_ERR "BTRFS: tree first key check failed\n"); + btrfs_err(fs_info, +"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", + eb->start, parent_transid, first_key->objectid, + first_key->type, first_key->offset, + found_key.objectid, found_key.type, + found_key.offset); + } + return ret; +} diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index c0861ce1429b..3c2a02a72f64 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -66,5 +66,7 @@ int btrfs_check_node(struct extent_buffer *node); int btrfs_check_chunk_valid(struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 logical); int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner); +int btrfs_verify_level_key(struct extent_buffer *eb, int level, + struct btrfs_key *first_key, u64 parent_transid); #endif -- cgit v1.2.3 From f18cc97845aa4ae0e795c088c979fe1642b3b8e5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 May 2023 07:58:38 -0700 Subject: btrfs: fix dirty_metadata_bytes for redirtied buffers dirty_metadata_bytes is decremented in both places that clear the dirty bit in a buffer, but only incremented in btrfs_mark_buffer_dirty, which means that a buffer that is redirtied using btrfs_redirty_list_add won't be added to dirty_metadata_bytes, but it will be subtracted when written out, leading an inconsistency in the counter. Move the dirty_metadata_bytes from btrfs_mark_buffer_dirty into set_extent_buffer_dirty to also account for the redirty case, and remove the now unused set_extent_buffer_dirty return value. 
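A simplified model of the imbalance, with invented names standing in for the real helpers: only the mark-dirty entry point used to add to the counter, while every path that clears the dirty bit subtracts, so a buffer redirtied behind btrfs_mark_buffer_dirty()'s back gets subtracted at writeback without ever having been added.

struct model_eb { bool dirty; long len; };

static long dirty_metadata_bytes;	/* stands in for the percpu counter */

static void model_set_dirty(struct model_eb *eb)	/* set_extent_buffer_dirty()  */
{
	eb->dirty = true;
	/* after the fix, the "dirty_metadata_bytes += eb->len" accounting lives here */
}

static void model_mark_dirty(struct model_eb *eb)	/* btrfs_mark_buffer_dirty()  */
{
	model_set_dirty(eb);
	dirty_metadata_bytes += eb->len;	/* before the fix: the only increment */
}

static void model_clear_dirty(struct model_eb *eb)	/* clear/writeback paths      */
{
	eb->dirty = false;
	dirty_metadata_bytes -= eb->len;	/* always decrements */
}

/*
 * btrfs_redirty_list_add() effectively takes the model_set_dirty() path
 * directly, so before the fix such a buffer was never counted in but was
 * still counted out, leaving the counter inconsistent.
 */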
Fixes: d3575156f662 ("btrfs: zoned: redirty released extent buffers") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Naohiro Aota Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 7 +------ fs/btrfs/extent_io.c | 7 ++++--- fs/btrfs/extent_io.h | 2 +- 3 files changed, 6 insertions(+), 10 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 83518ed71bfd..112c99dde57f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4621,7 +4621,6 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) { struct btrfs_fs_info *fs_info = buf->fs_info; u64 transid = btrfs_header_generation(buf); - int was_dirty; #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* @@ -4636,11 +4635,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) if (transid != fs_info->generation) WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n", buf->start, transid, fs_info->generation); - was_dirty = set_extent_buffer_dirty(buf); - if (!was_dirty) - percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, - buf->len, - fs_info->dirty_metadata_batch); + set_extent_buffer_dirty(buf); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY /* * btrfs_check_leaf() won't check item data if we don't have WRITTEN diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a1adadd5d25d..a829390632a5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4148,7 +4148,7 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, WARN_ON(atomic_read(&eb->refs) == 0); } -bool set_extent_buffer_dirty(struct extent_buffer *eb) +void set_extent_buffer_dirty(struct extent_buffer *eb) { int i; int num_pages; @@ -4183,13 +4183,14 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb) eb->start, eb->len); if (subpage) unlock_page(eb->pages[0]); + percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, + eb->len, + eb->fs_info->dirty_metadata_batch); } #ifdef CONFIG_BTRFS_DEBUG for (i = 0; i < num_pages; i++) ASSERT(PageDirty(eb->pages[i])); #endif - - return was_dirty; } void clear_extent_buffer_uptodate(struct extent_buffer *eb) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4341ad978fb8..f937654230d3 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -262,7 +262,7 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star void extent_buffer_bitmap_clear(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len); -bool set_extent_buffer_dirty(struct extent_buffer *eb); +void set_extent_buffer_dirty(struct extent_buffer *eb); void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); int extent_buffer_under_io(const struct extent_buffer *eb); -- cgit v1.2.3 From f880fe6e0b4b127d6e2a007fe68bdf5651677ae2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 May 2023 07:58:39 -0700 Subject: btrfs: don't hold an extra reference for redirtied buffers When btrfs_redirty_list_add redirties a buffer, it also acquires an extra reference that is released on transaction commit. But this is not required as buffers that are dirty or under writeback are never freed (look for calls to extent_buffer_under_io())). Remove the extra reference and the infrastructure used to drop it again. History behind redirty logic: In the first place, it used releasing_list to hold all the to-be-released extent buffers, and decided which buffers to re-dirty at the commit time. 
Then, in a later version, the behaviour got changed to re-dirty a necessary buffer and add re-dirtied one to the list in btrfs_free_tree_block(). In short, the list was there mostly for the patch series' historical reason. Reviewed-by: Naohiro Aota Signed-off-by: Christoph Hellwig [ add Naohiro's comment regarding history ] Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 -- fs/btrfs/extent_io.c | 1 - fs/btrfs/extent_io.h | 1 - fs/btrfs/transaction.c | 9 --------- fs/btrfs/transaction.h | 3 --- fs/btrfs/zoned.c | 28 ++++------------------------ fs/btrfs/zoned.h | 2 -- 7 files changed, 4 insertions(+), 42 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 112c99dde57f..16224fd82987 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -5073,8 +5073,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, EXTENT_DIRTY); btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents); - btrfs_free_redirty_list(cur_trans); - cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a829390632a5..d8becf1cdbc0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3557,7 +3557,6 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, init_rwsem(&eb->lock); btrfs_leak_debug_add_eb(eb); - INIT_LIST_HEAD(&eb->release_list); spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index f937654230d3..6f3cfadd232c 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -89,7 +89,6 @@ struct extent_buffer { struct rw_semaphore lock; struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; - struct list_head release_list; #ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; #endif diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 27c616fdfae2..fe0f00e717a8 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -374,8 +374,6 @@ loop: spin_lock_init(&cur_trans->dirty_bgs_lock); INIT_LIST_HEAD(&cur_trans->deleted_bgs); spin_lock_init(&cur_trans->dropped_roots_lock); - INIT_LIST_HEAD(&cur_trans->releasing_ebs); - spin_lock_init(&cur_trans->releasing_ebs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(fs_info, &cur_trans->dirty_pages, IO_TREE_TRANS_DIRTY_PAGES); @@ -2482,13 +2480,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) goto scrub_continue; } - /* - * At this point, we should have written all the tree blocks allocated - * in this transaction. So it's now safe to free the redirtyied extent - * buffers. 
- */ - btrfs_free_redirty_list(cur_trans); - ret = write_all_supers(fs_info, 0); /* * the super is written, we can safely allow the tree-loggers diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index fa728ab80826..8e9fa23bd7fe 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -94,9 +94,6 @@ struct btrfs_transaction { */ atomic_t pending_ordered; wait_queue_head_t pending_wait; - - spinlock_t releasing_ebs_lock; - struct list_head releasing_ebs; }; enum { diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index dac879fe2871..bda301a55cbe 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1602,37 +1602,17 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb) { - struct btrfs_fs_info *fs_info = eb->fs_info; - - if (!btrfs_is_zoned(fs_info) || - btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) || - !list_empty(&eb->release_list)) + if (!btrfs_is_zoned(eb->fs_info) || + btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN)) return; + ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + memzero_extent_buffer(eb, 0, eb->len); set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags); set_extent_buffer_dirty(eb); set_extent_bits_nowait(&trans->dirty_pages, eb->start, eb->start + eb->len - 1, EXTENT_DIRTY); - - spin_lock(&trans->releasing_ebs_lock); - list_add_tail(&eb->release_list, &trans->releasing_ebs); - spin_unlock(&trans->releasing_ebs_lock); - atomic_inc(&eb->refs); -} - -void btrfs_free_redirty_list(struct btrfs_transaction *trans) -{ - spin_lock(&trans->releasing_ebs_lock); - while (!list_empty(&trans->releasing_ebs)) { - struct extent_buffer *eb; - - eb = list_first_entry(&trans->releasing_ebs, - struct extent_buffer, release_list); - list_del_init(&eb->release_list); - free_extent_buffer(eb); - } - spin_unlock(&trans->releasing_ebs_lock); } bool btrfs_use_zone_append(struct btrfs_bio *bbio) diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index c0570d35fea2..3058ef559c98 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -54,7 +54,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new); void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb); -void btrfs_free_redirty_list(struct btrfs_transaction *trans); bool btrfs_use_zone_append(struct btrfs_bio *bbio); void btrfs_record_physical_zoned(struct btrfs_bio *bbio); void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered); @@ -179,7 +178,6 @@ static inline void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { } static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb) { } -static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { } static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio) { -- cgit v1.2.3 From aebcc1596b5c37095385ecd930cf335254828538 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:24:23 +0200 Subject: btrfs: move setting the buffer uptodate out of validate_extent_buffer Setting the buffer uptodate in a function that is named as a validation helper is a it confusing. Move the call from validate_extent_buffer to the one of its two callers that didn't already have a duplicate call to set_extent_buffer_uptodate. 
Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 16224fd82987..41045c900c2c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -533,9 +533,7 @@ static int validate_extent_buffer(struct extent_buffer *eb, if (found_level > 0 && btrfs_check_node(eb)) ret = -EIO; - if (!ret) - set_extent_buffer_uptodate(eb); - else + if (ret) btrfs_err(fs_info, "read time tree block corruption detected on logical %llu mirror %u", eb->start, eb->read_mirror); @@ -627,6 +625,8 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, goto err; } ret = validate_extent_buffer(eb, &bbio->parent_check); + if (!ret) + set_extent_buffer_uptodate(eb); err: if (ret) { /* -- cgit v1.2.3 From d87e6575e9d1c9d43e223c3fe858e4e453265707 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:24:24 +0200 Subject: btrfs: merge verify_parent_transid and btrfs_buffer_uptodate verify_parent_transid is only called by btrfs_buffer_uptodate, which confusingly inverts the return value. Merge the two functions and reflow the parent_transid so that error handling is in a branch. Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 45 ++++++++++++++------------------------------- 1 file changed, 14 insertions(+), 31 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 41045c900c2c..d4f7578cda00 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -110,32 +110,32 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result) * detect blocks that either didn't get written at all or got written * in the wrong place. 
*/ -static int verify_parent_transid(struct extent_io_tree *io_tree, - struct extent_buffer *eb, u64 parent_transid, - int atomic) +int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, int atomic) { + struct inode *btree_inode = eb->pages[0]->mapping->host; + struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree; struct extent_state *cached_state = NULL; - int ret; + int ret = 1; - if (!parent_transid || btrfs_header_generation(eb) == parent_transid) + if (!extent_buffer_uptodate(eb)) return 0; + if (!parent_transid || btrfs_header_generation(eb) == parent_transid) + return 1; + if (atomic) return -EAGAIN; lock_extent(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); - if (extent_buffer_uptodate(eb) && - btrfs_header_generation(eb) == parent_transid) { - ret = 0; - goto out; - } - btrfs_err_rl(eb->fs_info, + if (!extent_buffer_uptodate(eb) || + btrfs_header_generation(eb) != parent_transid) { + btrfs_err_rl(eb->fs_info, "parent transid verify failed on logical %llu mirror %u wanted %llu found %llu", eb->start, eb->read_mirror, parent_transid, btrfs_header_generation(eb)); - ret = 1; - clear_extent_buffer_uptodate(eb); -out: + clear_extent_buffer_uptodate(eb); + ret = 0; + } unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); return ret; @@ -4600,23 +4600,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_close_devices(fs_info->fs_devices); } -int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, - int atomic) -{ - int ret; - struct inode *btree_inode = buf->pages[0]->mapping->host; - - ret = extent_buffer_uptodate(buf); - if (!ret) - return ret; - - ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf, - parent_transid, atomic); - if (ret == -EAGAIN) - return ret; - return !ret; -} - void btrfs_mark_buffer_dirty(struct extent_buffer *buf) { struct btrfs_fs_info *fs_info = buf->fs_info; -- cgit v1.2.3 From 046b562b20a5cfe205fb78c6f8a1a8b74f01d303 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:24:28 +0200 Subject: btrfs: use a separate end_io handler for read_extent_buffer Now that we always use a single bio to read an extent_buffer, the buffer can be passed to the end_io handler as private data. This allows implementing a much simplified dedicated end I/O handler for metadata reads. 
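The idea, sketched here with invented types rather than the real btrfs_bio API: the submitter stashes a pointer to the object being read as the bio's private data, so the completion handler receives its context directly instead of re-deriving it from page->private or a radix tree lookup.

struct read_ctx {
	void (*end_io)(struct read_ctx *ctx);
	void *private;		/* object the read belongs to (here: the eb)  */
	bool uptodate;		/* filled in by the lower layer on completion */
};

struct model_buffer { bool uptodate; };

static void model_read_end_io(struct read_ctx *ctx)
{
	struct model_buffer *buf = ctx->private;	/* no lookup needed */

	buf->uptodate = ctx->uptodate;
}

static void model_submit_read(struct read_ctx *ctx, struct model_buffer *buf)
{
	ctx->private = buf;
	ctx->end_io = model_read_end_io;
	/* ... hand ctx to the lower layer, which invokes ctx->end_io() on completion ... */
}

The diff below does the equivalent by passing the extent_buffer as the last argument to btrfs_bio_alloc() and reading it back as bbio->private in extent_buffer_read_end_io().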
Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 105 +-------------------------------------------------- fs/btrfs/disk-io.h | 5 +-- fs/btrfs/extent_io.c | 80 ++++++++++++++++++--------------------- 3 files changed, 41 insertions(+), 149 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d4f7578cda00..db04a957b445 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -427,8 +427,8 @@ static int check_tree_block_fsid(struct extent_buffer *eb) } /* Do basic extent buffer checks at read time */ -static int validate_extent_buffer(struct extent_buffer *eb, - struct btrfs_tree_parent_check *check) +int btrfs_validate_extent_buffer(struct extent_buffer *eb, + struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; u64 found_start; @@ -541,107 +541,6 @@ out: return ret; } -static int validate_subpage_buffer(struct page *page, u64 start, u64 end, - int mirror, struct btrfs_tree_parent_check *check) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); - struct extent_buffer *eb; - bool reads_done; - int ret = 0; - - ASSERT(check); - - /* - * We don't allow bio merge for subpage metadata read, so we should - * only get one eb for each endio hook. - */ - ASSERT(end == start + fs_info->nodesize - 1); - ASSERT(PagePrivate(page)); - - eb = find_extent_buffer(fs_info, start); - /* - * When we are reading one tree block, eb must have been inserted into - * the radix tree. If not, something is wrong. - */ - ASSERT(eb); - - reads_done = atomic_dec_and_test(&eb->io_pages); - /* Subpage read must finish in page read */ - ASSERT(reads_done); - - eb->read_mirror = mirror; - if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) { - ret = -EIO; - goto err; - } - ret = validate_extent_buffer(eb, check); - if (ret < 0) - goto err; - - set_extent_buffer_uptodate(eb); - - free_extent_buffer(eb); - return ret; -err: - /* - * end_bio_extent_readpage decrements io_pages in case of error, - * make sure it has something to decrement. - */ - atomic_inc(&eb->io_pages); - clear_extent_buffer_uptodate(eb); - free_extent_buffer(eb); - return ret; -} - -int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, - struct page *page, u64 start, u64 end, - int mirror) -{ - struct extent_buffer *eb; - int ret = 0; - int reads_done; - - ASSERT(page->private); - - if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) - return validate_subpage_buffer(page, start, end, mirror, - &bbio->parent_check); - - eb = (struct extent_buffer *)page->private; - - /* - * The pending IO might have been the only thing that kept this buffer - * in memory. 
Make sure we have a ref for all this other checks - */ - atomic_inc(&eb->refs); - - reads_done = atomic_dec_and_test(&eb->io_pages); - if (!reads_done) - goto err; - - eb->read_mirror = mirror; - if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) { - ret = -EIO; - goto err; - } - ret = validate_extent_buffer(eb, &bbio->parent_check); - if (!ret) - set_extent_buffer_uptodate(eb); -err: - if (ret) { - /* - * our io error hook is going to dec the io pages - * again, we have to make sure it has something - * to decrement - */ - atomic_inc(&eb->io_pages); - clear_extent_buffer_uptodate(eb); - } - free_extent_buffer(eb); - - return ret; -} - #ifdef CONFIG_MIGRATION static int btree_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index a26ab3cbb449..b03767f4d7ed 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -82,9 +82,8 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); -int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, - struct page *page, u64 start, u64 end, - int mirror); +int btrfs_validate_extent_buffer(struct extent_buffer *eb, + struct btrfs_tree_parent_check *check); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ea99b2a3755e..4581ee5a0835 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -663,35 +663,6 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE); } -/* - * Find extent buffer for a givne bytenr. - * - * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking - * in endio context. - */ -static struct extent_buffer *find_extent_buffer_readpage( - struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) -{ - struct extent_buffer *eb; - - /* - * For regular sectorsize, we can use page->private to grab extent - * buffer - */ - if (fs_info->nodesize >= PAGE_SIZE) { - ASSERT(PagePrivate(page) && page->private); - return (struct extent_buffer *)page->private; - } - - /* For subpage case, we need to lookup buffer radix tree */ - rcu_read_lock(); - eb = radix_tree_lookup(&fs_info->buffer_radix, - bytenr >> fs_info->sectorsize_bits); - rcu_read_unlock(); - ASSERT(eb); - return eb; -} - /* * after a readpage IO is done, we need to: * clear the uptodate bits on error @@ -713,7 +684,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) * larger than UINT_MAX, u32 here is enough. 
*/ u32 bio_offset = 0; - int mirror; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); @@ -753,11 +723,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) end = start + bvec->bv_len - 1; len = bvec->bv_len; - mirror = bbio->mirror_num; - if (uptodate && !is_data_inode(inode) && - btrfs_validate_metadata_buffer(bbio, page, start, end, mirror)) - uptodate = false; - if (likely(uptodate)) { loff_t i_size = i_size_read(inode); pgoff_t end_index = i_size >> PAGE_SHIFT; @@ -778,13 +743,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) zero_user_segment(page, zero_start, offset_in_page(end) + 1); } - } else if (!is_data_inode(inode)) { - struct extent_buffer *eb; - - eb = find_extent_buffer_readpage(fs_info, page, start); - set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); - eb->read_mirror = mirror; - atomic_dec(&eb->io_pages); } /* Update page status and unlock. */ @@ -4221,6 +4179,42 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) } } +static void extent_buffer_read_end_io(struct btrfs_bio *bbio) +{ + struct extent_buffer *eb = bbio->private; + bool uptodate = !bbio->bio.bi_status; + struct bvec_iter_all iter_all; + struct bio_vec *bvec; + u32 bio_offset = 0; + + atomic_inc(&eb->refs); + eb->read_mirror = bbio->mirror_num; + + if (uptodate && + btrfs_validate_extent_buffer(eb, &bbio->parent_check) < 0) + uptodate = false; + + if (uptodate) { + set_extent_buffer_uptodate(eb); + } else { + clear_extent_buffer_uptodate(eb); + set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); + } + + bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { + atomic_dec(&eb->io_pages); + end_page_read(bvec->bv_page, uptodate, eb->start + bio_offset, + bvec->bv_len); + bio_offset += bvec->bv_len; + } + + unlock_extent(&bbio->inode->io_tree, eb->start, + eb->start + bio_offset - 1, NULL); + free_extent_buffer(eb); + + bio_put(&bbio->bio); +} + static void __read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, struct btrfs_tree_parent_check *check) { @@ -4234,7 +4228,7 @@ static void __read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, REQ_OP_READ | REQ_META, eb->fs_info, - end_bio_extent_readpage, NULL); + extent_buffer_read_end_io, eb); bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; -- cgit v1.2.3 From 31d89399dad0cf5524bd0e4e2c2827ae7004b2ca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:24:35 +0200 Subject: btrfs: remove the extent_buffer lookup in btree block checksumming The checksumming of btree blocks always operates on the entire extent_buffer, and because btree blocks are always allocated contiguously on disk they are never split by btrfs_submit_bio. Simplify the checksumming code by finding the extent_buffer in the btrfs_bio private data instead of trying to search through the bio_vec. 
Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 120 +++++++++++------------------------------------------ 1 file changed, 24 insertions(+), 96 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index db04a957b445..384c8110ef2a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -254,12 +254,34 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, return ret; } -static int csum_one_extent_buffer(struct extent_buffer *eb) +/* + * Checksum a dirty tree block before IO. + */ +blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) { + struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; + u64 found_start = btrfs_header_bytenr(eb); u8 result[BTRFS_CSUM_SIZE]; int ret; + /* Btree blocks are always contiguous on disk. */ + if (WARN_ON_ONCE(bbio->file_offset != eb->start)) + return BLK_STS_IOERR; + if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len)) + return BLK_STS_IOERR; + + if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) { + WARN_ON_ONCE(found_start != 0); + return BLK_STS_OK; + } + + if (WARN_ON_ONCE(found_start != eb->start)) + return BLK_STS_IOERR; + if (WARN_ON(!btrfs_page_test_uptodate(fs_info, eb->pages[0], eb->start, + eb->len))) + return BLK_STS_IOERR; + ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, offsetof(struct btrfs_header, fsid), BTRFS_FSID_SIZE) == 0); @@ -286,8 +308,7 @@ static int csum_one_extent_buffer(struct extent_buffer *eb) goto error; } write_extent_buffer(eb, result, 0, fs_info->csum_size); - - return 0; + return BLK_STS_OK; error: btrfs_print_tree(eb, 0); @@ -301,99 +322,6 @@ error: */ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) || btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID); - return ret; -} - -/* Checksum all dirty extent buffers in one bio_vec */ -static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info, - struct bio_vec *bvec) -{ - struct page *page = bvec->bv_page; - u64 bvec_start = page_offset(page) + bvec->bv_offset; - u64 cur; - int ret = 0; - - for (cur = bvec_start; cur < bvec_start + bvec->bv_len; - cur += fs_info->nodesize) { - struct extent_buffer *eb; - bool uptodate; - - eb = find_extent_buffer(fs_info, cur); - uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur, - fs_info->nodesize); - - /* A dirty eb shouldn't disappear from buffer_radix */ - if (WARN_ON(!eb)) - return -EUCLEAN; - - if (WARN_ON(cur != btrfs_header_bytenr(eb))) { - free_extent_buffer(eb); - return -EUCLEAN; - } - if (WARN_ON(!uptodate)) { - free_extent_buffer(eb); - return -EUCLEAN; - } - - ret = csum_one_extent_buffer(eb); - free_extent_buffer(eb); - if (ret < 0) - return ret; - } - return ret; -} - -/* - * Checksum a dirty tree block before IO. This has extra checks to make sure - * we only fill in the checksum field in the first page of a multi-page block. - * For subpage extent buffers we need bvec to also read the offset in the page. 
- */ -static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec) -{ - struct page *page = bvec->bv_page; - u64 start = page_offset(page); - u64 found_start; - struct extent_buffer *eb; - - if (fs_info->nodesize < PAGE_SIZE) - return csum_dirty_subpage_buffers(fs_info, bvec); - - eb = (struct extent_buffer *)page->private; - if (page != eb->pages[0]) - return 0; - - found_start = btrfs_header_bytenr(eb); - - if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) { - WARN_ON(found_start != 0); - return 0; - } - - /* - * Please do not consolidate these warnings into a single if. - * It is useful to know what went wrong. - */ - if (WARN_ON(found_start != start)) - return -EUCLEAN; - if (WARN_ON(!PageUptodate(page))) - return -EUCLEAN; - - return csum_one_extent_buffer(eb); -} - -blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) -{ - struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; - struct bvec_iter iter; - struct bio_vec bv; - int ret = 0; - - bio_for_each_segment(bv, &bbio->bio, iter) { - ret = csum_dirty_buffer(fs_info, &bv); - if (ret) - break; - } - return errno_to_blk_status(ret); } -- cgit v1.2.3 From 9e2aff90fc2ad2b94933762f5442fef9ee7a692e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:24:39 +0200 Subject: btrfs: stop using lock_extent in btrfs_buffer_uptodate The only other place that locks extents on the btree inode is read_extent_buffer_subpage while reading in the partial page for a buffer. This means locking the extent in btrfs_buffer_uptodate does not synchronize with anything on non-subpage file systems, and on subpage file systems it only waits for a parallel read(-ahead) to finish, which seems to be counter to what the callers actually expect. Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 384c8110ef2a..a3366fe546b1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -112,11 +112,6 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result) */ int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, int atomic) { - struct inode *btree_inode = eb->pages[0]->mapping->host; - struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree; - struct extent_state *cached_state = NULL; - int ret = 1; - if (!extent_buffer_uptodate(eb)) return 0; @@ -126,7 +121,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, int atom if (atomic) return -EAGAIN; - lock_extent(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); if (!extent_buffer_uptodate(eb) || btrfs_header_generation(eb) != parent_transid) { btrfs_err_rl(eb->fs_info, @@ -134,11 +128,9 @@ int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, int atom eb->start, eb->read_mirror, parent_transid, btrfs_header_generation(eb)); clear_extent_buffer_uptodate(eb); - ret = 0; + return 0; } - unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, - &cached_state); - return ret; + return 1; } static bool btrfs_supported_super_csum(u16 csum_type) -- cgit v1.2.3 From 413fb1bc1d3224e508c698cd82e5ff1dc7d16d64 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 24 May 2023 20:02:39 +0800 Subject: btrfs: return bool from check_tree_block_fsid instead of int Simplify the return type of check_tree_block_fsid() from int (1 or 0) to bool. 
Its only user is interested in knowing the success or failure. Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a3366fe546b1..8102d2aa3f62 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -317,7 +317,7 @@ error: return errno_to_blk_status(ret); } -static int check_tree_block_fsid(struct extent_buffer *eb) +static bool check_tree_block_fsid(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; @@ -337,13 +337,13 @@ static int check_tree_block_fsid(struct extent_buffer *eb) metadata_uuid = fs_devices->fsid; if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) - return 0; + return false; list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE)) - return 0; + return false; - return 1; + return true; } /* Do basic extent buffer checks at read time */ -- cgit v1.2.3 From 25984a5ae8f144e823468d39e483face534e45d1 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 24 May 2023 20:02:42 +0800 Subject: btrfs: consolidate uuid comparisons in btrfs_validate_super There are three ways the fsid is validated in btrfs_validate_super(): - verify that super_copy::fsid is the same as fs_devices::fsid - if the metadata_uuid flag is set, verify if super_copy::metadata_uuid and fs_devices::metadata_uuid are the same. - a few lines below, often missed out, verify if dev_item::fsid is the same as fs_devices::metadata_uuid. The function btrfs_validate_super() contains multiple if-statements with memcmp() to check UUIDs. This patch consolidates them into a single location. Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8102d2aa3f62..6e0d4f99ab04 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2399,6 +2399,14 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } + if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, + BTRFS_FSID_SIZE) != 0) { + btrfs_err(fs_info, + "dev_item UUID does not match metadata fsid: %pU != %pU", + fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid); + ret = -EINVAL; + } + /* * Artificial requirement for block-group-tree to force newer features * (free-space-tree, no-holes) so the test matrix is smaller. @@ -2411,14 +2419,6 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } - if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, - BTRFS_FSID_SIZE) != 0) { - btrfs_err(fs_info, - "dev_item UUID does not match metadata fsid: %pU != %pU", - fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid); - ret = -EINVAL; - } - /* * Hint to catch really bogus numbers, bitflips or so, more exact checks are * done later -- cgit v1.2.3 From 85724171b302914bb8999b9df091fd4616a36eb7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 May 2023 10:40:18 +0200 Subject: btrfs: fix the btrfs_get_global_root return value btrfs_grab_root returns either the root or NULL, and the callers of btrfs_get_global_root expect it to return the same. But all the more recently added roots instead return an ERR_PTR, so fix this. 
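To make the expected contract concrete, here is a hypothetical caller (a sketch only, not code from the kernel tree) that relies on the "referenced root or NULL" convention described above; with a consistent return value a single NULL check is enough, whereas a mix of NULL and ERR_PTR values would force every caller to test for both:

/* Hypothetical caller, illustrating the "root or NULL" contract.
 * The helper name is made up; only btrfs_get_global_root(),
 * btrfs_put_root() and root_key are real kernel identifiers. */
static u64 example_global_root_id(struct btrfs_fs_info *fs_info, u64 objectid)
{
        struct btrfs_root *root = btrfs_get_global_root(fs_info, objectid);
        u64 id;

        if (!root)              /* the only check a caller should need */
                return 0;
        id = root->root_key.objectid;
        btrfs_put_root(root);   /* drop the reference taken by btrfs_grab_root() */
        return id;
}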
Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6e0d4f99ab04..4d347995fcaa 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1183,19 +1183,13 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, if (objectid == BTRFS_CSUM_TREE_OBJECTID) return btrfs_grab_root(btrfs_global_root(fs_info, &key)); if (objectid == BTRFS_QUOTA_TREE_OBJECTID) - return btrfs_grab_root(fs_info->quota_root) ? - fs_info->quota_root : ERR_PTR(-ENOENT); + return btrfs_grab_root(fs_info->quota_root); if (objectid == BTRFS_UUID_TREE_OBJECTID) - return btrfs_grab_root(fs_info->uuid_root) ? - fs_info->uuid_root : ERR_PTR(-ENOENT); + return btrfs_grab_root(fs_info->uuid_root); if (objectid == BTRFS_BLOCK_GROUP_TREE_OBJECTID) - return btrfs_grab_root(fs_info->block_group_root) ? - fs_info->block_group_root : ERR_PTR(-ENOENT); - if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) { - struct btrfs_root *root = btrfs_global_root(fs_info, &key); - - return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT); - } + return btrfs_grab_root(fs_info->block_group_root); + if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) + return btrfs_grab_root(btrfs_global_root(fs_info, &key)); return NULL; } -- cgit v1.2.3 From e91909aace336a182da20bcbe507995ac24f74bb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 May 2023 10:40:19 +0200 Subject: btrfs: convert btrfs_get_global_root to use a switch statement Use a switch statement instead of an endless chain of if statements to make the code a little cleaner. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4d347995fcaa..c53c82cd0885 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1172,25 +1172,28 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, .offset = 0, }; - if (objectid == BTRFS_ROOT_TREE_OBJECTID) + switch (objectid) { + case BTRFS_ROOT_TREE_OBJECTID: return btrfs_grab_root(fs_info->tree_root); - if (objectid == BTRFS_EXTENT_TREE_OBJECTID) + case BTRFS_EXTENT_TREE_OBJECTID: return btrfs_grab_root(btrfs_global_root(fs_info, &key)); - if (objectid == BTRFS_CHUNK_TREE_OBJECTID) + case BTRFS_CHUNK_TREE_OBJECTID: return btrfs_grab_root(fs_info->chunk_root); - if (objectid == BTRFS_DEV_TREE_OBJECTID) + case BTRFS_DEV_TREE_OBJECTID: return btrfs_grab_root(fs_info->dev_root); - if (objectid == BTRFS_CSUM_TREE_OBJECTID) + case BTRFS_CSUM_TREE_OBJECTID: return btrfs_grab_root(btrfs_global_root(fs_info, &key)); - if (objectid == BTRFS_QUOTA_TREE_OBJECTID) + case BTRFS_QUOTA_TREE_OBJECTID: return btrfs_grab_root(fs_info->quota_root); - if (objectid == BTRFS_UUID_TREE_OBJECTID) + case BTRFS_UUID_TREE_OBJECTID: return btrfs_grab_root(fs_info->uuid_root); - if (objectid == BTRFS_BLOCK_GROUP_TREE_OBJECTID) + case BTRFS_BLOCK_GROUP_TREE_OBJECTID: return btrfs_grab_root(fs_info->block_group_root); - if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) + case BTRFS_FREE_SPACE_TREE_OBJECTID: return btrfs_grab_root(btrfs_global_root(fs_info, &key)); - return NULL; + default: + return NULL; + } } int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, -- cgit v1.2.3 From 25ac047c9d3df373de35ba4ee3f0c902874a1fab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 May 2023 10:40:20 +0200 Subject: btrfs: remove a pointless NULL check in btrfs_lookup_fs_root btrfs_grab_root already checks for a NULL root itself. Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c53c82cd0885..06dab9d018c2 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1157,8 +1157,7 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->fs_roots_radix_lock); root = radix_tree_lookup(&fs_info->fs_roots_radix, (unsigned long)root_id); - if (root) - root = btrfs_grab_root(root); + root = btrfs_grab_root(root); spin_unlock(&fs_info->fs_roots_radix_lock); return root; } -- cgit v1.2.3 From 58e814fcacc1f652d2b794c82b7c9d96ee3c3bab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 25 May 2023 13:33:08 -1000 Subject: btrfs: use alloc_ordered_workqueue() to create ordered workqueues BACKGROUND ========== When multiple work items are queued to a workqueue, their execution order doesn't match the queueing order. They may get executed in any order and simultaneously. When fully serialized execution - one by one in the queueing order - is needed, an ordered workqueue should be used which can be created with alloc_ordered_workqueue(). However, alloc_ordered_workqueue() was a later addition. Before it, an ordered workqueue could be obtained by creating an UNBOUND workqueue with @max_active==1. 
This originally was an implementation side-effect which was broken by 4c16bd327c74 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered"). Because there were users that depended on the ordered execution, 5c0338c68706 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered") made the workqueue allocation path implicitly promote UNBOUND workqueues with @max_active==1 to ordered workqueues.

While this has worked okay, overloading the UNBOUND allocation interface this way creates other issues. It is difficult to tell whether a given workqueue actually needs to be ordered, and users that legitimately want a minimum concurrency level unexpectedly get an ordered workqueue instead. With planned UNBOUND workqueue updates to improve execution locality, and the growing prevalence of chiplet designs that can benefit from such improvements, this is not a state we want to stay in forever.

This patch series audits all call sites that create an UNBOUND workqueue with @max_active==1 and converts them to alloc_ordered_workqueue() as necessary.

BTRFS
=====

* fs_info->scrub_workers, initialized in scrub_workers_get(), was setting @max_active to 1 when @is_dev_replace is set, and it seems that the workqueue actually needs to be ordered if @is_dev_replace. Update the code so that alloc_ordered_workqueue() is used if @is_dev_replace.

* fs_info->discard_ctl.discard_workers, initialized in btrfs_init_workqueues(), was directly using alloc_workqueue() with @max_active==1. Converted to alloc_ordered_workqueue().

* fs_info->fixup_workers and fs_info->qgroup_rescan_workers, initialized in btrfs_queue_work(), use btrfs's workqueue wrapper, btrfs_workqueue, which is allocated with btrfs_alloc_workqueue(). btrfs_workqueue implements automatic @max_active adjustment, which is disabled when the specified max limit is below a certain threshold, so calling btrfs_alloc_workqueue() with @limit_active==1 yields an ordered workqueue whose @max_active won't be changed as the auto-tuning is disabled. This is rather brittle in that nothing clearly indicates that the two workqueues must be ordered, or that btrfs_alloc_workqueue() must disable auto-tuning when @limit_active==1. This patch factors out the common btrfs_workqueue init code into btrfs_init_workqueue() and adds an explicit btrfs_alloc_ordered_workqueue(). The two workqueues are converted to use the new ordered allocation interface.
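As a generic illustration of the conversion this series performs (a sketch only; "example" is a made-up workqueue name, not one of the btrfs queues):

#include <linux/workqueue.h>

/* Illustration of the before/after shape of the conversion. */
static struct workqueue_struct *example_create_ordered_wq(void)
{
        /*
         * Before: ordering was only an implicit side effect of
         * UNBOUND + max_active == 1:
         *
         *   return alloc_workqueue("example",
         *                          WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_FREEZABLE, 1);
         */

        /* After: the one-at-a-time, in-order requirement is stated explicitly. */
        return alloc_ordered_workqueue("example", WQ_MEM_RECLAIM | WQ_FREEZABLE);
}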
Signed-off-by: Tejun Heo Signed-off-by: David Sterba --- fs/btrfs/async-thread.c | 44 +++++++++++++++++++++++++++++++++++++++----- fs/btrfs/async-thread.h | 3 +++ fs/btrfs/disk-io.c | 8 +++++--- fs/btrfs/scrub.c | 6 ++++-- 4 files changed, 51 insertions(+), 10 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index aac240430efe..ce083e99ef68 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -71,6 +71,16 @@ bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq) return atomic_read(&wq->pending) > wq->thresh * 2; } +static void btrfs_init_workqueue(struct btrfs_workqueue *wq, + struct btrfs_fs_info *fs_info) +{ + wq->fs_info = fs_info; + atomic_set(&wq->pending, 0); + INIT_LIST_HEAD(&wq->ordered_list); + spin_lock_init(&wq->list_lock); + spin_lock_init(&wq->thres_lock); +} + struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, unsigned int flags, int limit_active, int thresh) @@ -80,9 +90,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, if (!ret) return NULL; - ret->fs_info = fs_info; + btrfs_init_workqueue(ret, fs_info); + ret->limit_active = limit_active; - atomic_set(&ret->pending, 0); if (thresh == 0) thresh = DFT_THRESHOLD; /* For low threshold, disabling threshold is a better choice */ @@ -106,9 +116,33 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, return NULL; } - INIT_LIST_HEAD(&ret->ordered_list); - spin_lock_init(&ret->list_lock); - spin_lock_init(&ret->thres_lock); + trace_btrfs_workqueue_alloc(ret, name); + return ret; +} + +struct btrfs_workqueue *btrfs_alloc_ordered_workqueue( + struct btrfs_fs_info *fs_info, const char *name, + unsigned int flags) +{ + struct btrfs_workqueue *ret; + + ret = kzalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return NULL; + + btrfs_init_workqueue(ret, fs_info); + + /* Ordered workqueues don't allow @max_active adjustments. 
*/ + ret->limit_active = 1; + ret->current_active = 1; + ret->thresh = NO_THRESHOLD; + + ret->normal_wq = alloc_ordered_workqueue("btrfs-%s", flags, name); + if (!ret->normal_wq) { + kfree(ret); + return NULL; + } + trace_btrfs_workqueue_alloc(ret, name); return ret; } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 6e2596ddae10..30f66c5e2e6e 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -31,6 +31,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, unsigned int flags, int limit_active, int thresh); +struct btrfs_workqueue *btrfs_alloc_ordered_workqueue( + struct btrfs_fs_info *fs_info, const char *name, + unsigned int flags); void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func, btrfs_func_t ordered_func, btrfs_func_t ordered_free); void btrfs_queue_work(struct btrfs_workqueue *wq, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 06dab9d018c2..1ad7037c6631 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1939,6 +1939,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) { u32 max_active = fs_info->thread_pool_size; unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; + unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE; fs_info->workers = btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16); @@ -1955,7 +1956,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0); fs_info->fixup_workers = - btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0); + btrfs_alloc_ordered_workqueue(fs_info, "fixup", ordered_flags); fs_info->endio_workers = alloc_workqueue("btrfs-endio", flags, max_active); @@ -1974,9 +1975,10 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) btrfs_alloc_workqueue(fs_info, "delayed-meta", flags, max_active, 0); fs_info->qgroup_rescan_workers = - btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0); + btrfs_alloc_ordered_workqueue(fs_info, "qgroup-rescan", + ordered_flags); fs_info->discard_ctl.discard_workers = - alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1); + alloc_ordered_workqueue("btrfs_discard", WQ_FREEZABLE); if (!(fs_info->workers && fs_info->delalloc_workers && fs_info->flush_workers && diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index d7a14458c4a7..316c5a0e3a44 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2740,8 +2740,10 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt)) return 0; - scrub_workers = alloc_workqueue("btrfs-scrub", flags, - is_dev_replace ? 1 : max_active); + if (is_dev_replace) + scrub_workers = alloc_ordered_workqueue("btrfs-scrub", flags); + else + scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active); if (!scrub_workers) goto fail_scrub_workers; -- cgit v1.2.3 From 4d34ad34d7cc5390ec03b25f2a7f2fd5041cb7d8 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 29 May 2023 16:16:59 +0100 Subject: btrfs: remove pointless in_tree field from struct btrfs_delayed_ref_node The 'in_tree' field is really not needed in struct btrfs_delayed_ref_node, as we can check whether a reference is in the tree or not simply by checking its red black tree node member with RB_EMPTY_NODE(), as when we remove it from the tree we always call RB_CLEAR_NODE(). So remove that field and use RB_EMPTY_NODE(). 
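To make the idea concrete, here is a small generic sketch of the pattern (illustrative only, not btrfs code; the names are made up): the cleared rb_node itself serves as the "not linked" marker, so no separate flag is needed.

#include <linux/rbtree.h>
#include <linux/types.h>

struct example_entry {
        struct rb_node node;
};

static void example_entry_init(struct example_entry *e)
{
        /* A cleared node doubles as the "not in a tree" marker. */
        RB_CLEAR_NODE(&e->node);
}

static void example_entry_unlink(struct rb_root *root, struct example_entry *e)
{
        rb_erase(&e->node, root);
        /* Keep the invariant: cleared whenever the entry is out of the tree. */
        RB_CLEAR_NODE(&e->node);
}

static bool example_entry_is_linked(struct example_entry *e)
{
        /* Replaces a separate in_tree flag. */
        return !RB_EMPTY_NODE(&e->node);
}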
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 2 -- fs/btrfs/delayed-ref.h | 3 +-- fs/btrfs/disk-io.c | 1 - fs/btrfs/extent-tree.c | 1 - 4 files changed, 1 insertion(+), 6 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index be1d18ec5cef..d6dce5792c0f 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -407,7 +407,6 @@ static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs, RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) list_del(&ref->add_list); - ref->in_tree = 0; btrfs_put_delayed_ref(ref); atomic_dec(&delayed_refs->num_entries); } @@ -853,7 +852,6 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, ref->num_bytes = num_bytes; ref->ref_mod = 1; ref->action = action; - ref->in_tree = 1; ref->seq = seq; ref->type = ref_type; RB_CLEAR_NODE(&ref->ref_node); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 77b3e735772d..9b103bf27185 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -48,7 +48,6 @@ struct btrfs_delayed_ref_node { unsigned int action:8; unsigned int type:8; - unsigned int in_tree:1; }; struct btrfs_delayed_extent_op { @@ -341,7 +340,7 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) { WARN_ON(refcount_read(&ref->refs) == 0); if (refcount_dec_and_test(&ref->refs)) { - WARN_ON(ref->in_tree); + WARN_ON(!RB_EMPTY_NODE(&ref->ref_node)); switch (ref->type) { case BTRFS_TREE_BLOCK_REF_KEY: case BTRFS_SHARED_BLOCK_REF_KEY: diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1ad7037c6631..55cbbe7b0dba 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4602,7 +4602,6 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, while ((n = rb_first_cached(&head->ref_tree)) != NULL) { ref = rb_entry(n, struct btrfs_delayed_ref_node, ref_node); - ref->in_tree = 0; rb_erase_cached(&ref->ref_node, &head->ref_tree); RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 71aaf28fafc9..81a673506077 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1913,7 +1913,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, return -EAGAIN; } - ref->in_tree = 0; rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) -- cgit v1.2.3 From 184533e3618f4d0b382c1ef3de0ce34e849005d7 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 29 May 2023 16:17:06 +0100 Subject: btrfs: remove unnecessary prototype declarations at disk-io.c We have a few static functions at disk-io.c for which we have a forward declaration of their prototype, but it's not needed because all those functions are defined before they are called, so remove them. 
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 55cbbe7b0dba..c22f8f58c951 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -60,15 +60,6 @@ BTRFS_SUPER_FLAG_METADUMP |\ BTRFS_SUPER_FLAG_METADUMP_V2) -static void btrfs_destroy_ordered_extents(struct btrfs_root *root); -static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, - struct btrfs_fs_info *fs_info); -static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root); -static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, - struct extent_io_tree *dirty_pages, - int mark); -static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, - struct extent_io_tree *pinned_extents); static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info); static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info); -- cgit v1.2.3 From 99f09ce309b8307ce8dca209f936e99a7c332214 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 2 Jun 2023 12:19:42 +0100 Subject: btrfs: make btrfs_destroy_delayed_refs() return void btrfs_destroy_delayed_refs() always returns 0 and its single caller does not check its return value, as it also returns void, and so does the callers' caller and so on. This is because we are in the transaction abort path, where we have no way to deal with errors (we are in a critical situation) and all cleanup of resources works in a best effort fashion. So make btrfs_destroy_delayed_refs() return void. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c22f8f58c951..4328c6b1120e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4562,13 +4562,12 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } -static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, - struct btrfs_fs_info *fs_info) +static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_fs_info *fs_info) { struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; - int ret = 0; delayed_refs = &trans->delayed_refs; @@ -4576,7 +4575,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, if (atomic_read(&delayed_refs->num_entries) == 0) { spin_unlock(&delayed_refs->lock); btrfs_debug(fs_info, "delayed_refs has NO entry"); - return ret; + return; } while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) { @@ -4637,8 +4636,6 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, btrfs_qgroup_destroy_extent_records(trans); spin_unlock(&delayed_refs->lock); - - return ret; } static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) -- cgit v1.2.3 From efcfcbc6a36195c42d98e0ee697baba36da94dc8 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 4 Apr 2023 00:06:02 +0200 Subject: btrfs: add xxhash to fast checksum implementations The implementation of XXHASH is now CPU only but still fast enough to be considered for the synchronous checksumming, like non-generic crc32c. 
A userspace benchmark comparing it to various implementations (patched hash-speedtest from btrfs-progs):

Block size:     4096
Iterations:     1000000
Implementation: builtin
Units:          CPU cycles

        NULL-NOP: cycles:    73384294, cycles/i    73
     NULL-MEMCPY: cycles:   228033868, cycles/i   228, 61664.320 MiB/s
      CRC32C-ref: cycles: 24758559416, cycles/i 24758,   567.950 MiB/s
       CRC32C-NI: cycles:  1194350470, cycles/i  1194, 11773.433 MiB/s
  CRC32C-ADLERSW: cycles:  6150186216, cycles/i  6150,  2286.372 MiB/s
  CRC32C-ADLERHW: cycles:   626979180, cycles/i   626, 22427.453 MiB/s
      CRC32C-PCL: cycles:   466746732, cycles/i   466, 30126.699 MiB/s
          XXHASH: cycles:   860656400, cycles/i   860, 16338.188 MiB/s

The comparison covers the purely software implementation (ref), the current but outdated implementation accelerated by the crc32q instruction (NI), the optimized implementations by M. Adler (https://stackoverflow.com/questions/17645167/implementing-sse-4-2s-crc32c-in-software/17646775#17646775), and the best one, taken from the kernel and using the PCLMULQDQ instruction (PCL).

Reviewed-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4328c6b1120e..7513388b0567 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2011,6 +2011,9 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) if (!strstr(crypto_shash_driver_name(csum_shash), "generic")) set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); break; + case BTRFS_CSUM_TYPE_XXHASH: + set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); + break; default: break; } -- cgit v1.2.3
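For anyone who wants to reproduce a rough userspace comparison, a minimal sketch using the xxHash library is given below. This is not part of the patch; it assumes libxxhash is installed (xxhash.h, linked with -lxxhash), the file name and wall-clock timing are arbitrary choices, and its MiB/s numbers are not directly comparable to the cycles-based table above.

/* xxh64-speed.c: hash a 4 KiB block one million times with XXH64.
 * Build (assumption): gcc -O2 xxh64-speed.c -lxxhash -o xxh64-speed */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <xxhash.h>

int main(void)
{
        unsigned char block[4096];
        const int iterations = 1000000;
        uint64_t sum = 0;
        struct timespec start, end;
        double secs;

        memset(block, 0xab, sizeof(block));

        clock_gettime(CLOCK_MONOTONIC, &start);
        for (int i = 0; i < iterations; i++)
                sum ^= XXH64(block, sizeof(block), 0);   /* seed 0 */
        clock_gettime(CLOCK_MONOTONIC, &end);

        secs = (end.tv_sec - start.tv_sec) +
               (end.tv_nsec - start.tv_nsec) / 1e9;
        printf("XXH64: %.3f MiB/s (xor of hashes: %llx)\n",
               iterations * (sizeof(block) / (1024.0 * 1024.0)) / secs,
               (unsigned long long)sum);
        return 0;
}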