Merge tag 'for-7.2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs updates from David Sterba:
 "The most noticeable change is to enable large folios by default, it's
  been in testing for a few releases. Related to that is huge folio
  support (still under experimental config).

  Otherwise a few ioctl updates, performance improvements and usual
  fixes and core changes.

  User visible changes:

   - enable large folios by default, added in 6.17 (under experimental
     build), no feature limitations, a big change internally

   - new ioctl to return raw checksums to userspace (a bit tricky given
     compression and tail extents), can be used for mkfs and
     deduplication optimizations

   - provide stable UUID for e.g. overlayfs and temp_fsid, also
     reflected in statvfs() field f_fsid, internal dev_t is hashed in
     to allow cloning

   - add 32bit compat version of GET_SUBVOL_INFO ioctl

   - in experimental build, support huge folios (up to 2M)

  Performance related improvements/changes:

   - limit bio size to the estimated optimum derived from the queue,
     this prevents build up of too much data for writeback, which could
     cause latency spikes (reported improvement 15% on sequential
     writes)

   - don't force direct IO to be serialized, forgotten change during
     mount API port, brings back +60% of throughput

   - lockless calculation of number of shrinkable extent maps, improve
     performance with many memcg allocated objects

  Notable fixes:

   - in zoned mode, fix a deadlock due to zone reclaim and relocation
     when space needs to be flushed

   - don't trim device which is internally not tracked as writeable
     (e.g. when missing device is being rescanned)

   - fix deadlock when cloning inline extent and mounted with
     flushoncommit

   - fix false IO failures after direct IO falls back to buffered write
     in some cases

  Core:

   - remove COW fixup mechanism completely; detect and fix changes to
     pages outside of filesystem tracking, guaranteed since 5.8, grace
     period is over

   - remove 2K block size support, experimental to test subpage code on
     x86_64 but now it would block folio changes

   - tree-checker improvements of:
       - free-space cache and tree items
       - root reference and backref items
       - extent state exceptions in reloc tree

   - subpage mode updates:
       - code optimizations, simplify tracking bitmaps
       - re-enable readahead of compressed extent
       - extend bitmap size to cover huge folios

   - add tracepoints related to sync, tree-log and transactions

   - device stats item tracking unification, remove item if there are
     no stats recorded, also don't leave stale stats on replaced device

   - allow extent buffer pages to be allocated as movable, to help page
     migration

   - added checks for proper extent buffer release

   - btrfs.ko code size reduction due to transaction abort call
     simplifications

   - several struct size reductions

   - more auto free conversions

   - more verbose assertions"

* tag 'for-7.2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (130 commits)
  btrfs: fix use-after-free after relocation failure with concurrent COW
  btrfs: move WARN_ON on unexpected error in __add_tree_block()
  btrfs: move locking into btrfs_get_reloc_bg_bytenr()
  btrfs: lzo: reject compressed segment that overflows the compressed input
  btrfs: retry faulting in the pages after a zero sized short direct write
  btrfs: fix incorrect buffered IO fallback for append direct writes
  btrfs: fix false IO failure after falling back to buffered write
  btrfs: use verbose assertions in backref.c
  btrfs: print a message when a missing device re-appears
  btrfs: do not trim a device which is not writeable
  btrfs: return real error after lookup failure in btrfs_ioctl_default_subvol()
  btrfs: use mapping shared locking for reading super block
  btrfs: use lockless read in nr_cached_objects shrinker callback
  btrfs: switch local indicator variables to bools
  btrfs: send: pass bool for pending_move and refs_processed parameters
  btrfs: use shifts for sectorsize and nodesize
  btrfs: fix deadlock cloning inline extent when using flushoncommit
  btrfs: allocate eb-attached btree pages as movable
  btrfs: add 32-bit compat ioctl for BTRFS_IOC_GET_SUBVOL_INFO
  btrfs: derive f_fsid from on-disk fsid and dev_t
  ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2026-06-16 09:38:02 +0300
committer: Linus Torvalds <torvalds@linux-foundation.org> 2026-06-16 09:38:02 +0300
commit: 31b706da2cfd8ee3352391181ccf9696bed3d25d (patch)
tree: c76b0a7317b3b58f5638b9c7cabd8f640ce5be56
parent: 477c122f8c1d5d9f57c4f9c1f7a1631beaa38bcc (diff)
parent: ae2eb64bfd9762536f60b690840adcdf622cdcce (diff)
download: linux-31b706da2cfd8ee3352391181ccf9696bed3d25d.tar.xz
58 files changed, 3475 insertions, 1904 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 5e75438e0b73..9de04c37e11a 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -93,10 +93,6 @@ config BTRFS_EXPERIMENTAL
 
 	  Current list:
 
-	  - COW fixup worker warning - last warning before removing the
-				       functionality catching out-of-band page
-				       dirtying, not necessary since 5.8
-
 	  - RAID mirror read policy - additional read policies for balancing
 				      reading from redundant block group
 				      profiles (currently: pid, round-robin,
@@ -110,7 +106,9 @@ config BTRFS_EXPERIMENTAL
 
 	  - extent tree v2 - complex rework of extent tracking
 
-	  - large folio and block size (> page size) support
+	  - block size > page size support
+
+	  - huge folios for data - folios can be as large as 2MiB now
 
 	  - asynchronous checksum generation for data writes
 
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 273924ca912c..23c3eeb58dc1 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1256,7 +1256,7 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ct
 	 * realizing. We cache results only for extent buffers that lead from
 	 * the root node down to the leaf with the file extent item.
 	 */
-	ASSERT(level >= 0);
+	ASSERT(level >= 0, "level=%d", level);
 
 	entry = &ctx->path_cache_entries[level];
 
@@ -1327,7 +1327,7 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx
 	 * realizing. We cache results only for extent buffers that lead from
 	 * the root node down to the leaf with the file extent item.
 	 */
-	ASSERT(level >= 0);
+	ASSERT(level >= 0, "level=%d", level);
 
 	if (is_shared)
 		gen = btrfs_get_last_root_drop_gen(fs_info);
@@ -2367,7 +2367,7 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
 		info = (struct btrfs_tree_block_info *)(ei + 1);
 		*out_level = btrfs_tree_block_level(eb, info);
 	} else {
-		ASSERT(key->type == BTRFS_METADATA_ITEM_KEY);
+		ASSERT(key->type == BTRFS_METADATA_ITEM_KEY, "key->type=%hhu", key->type);
 		*out_level = (u8)key->offset;
 	}
 
@@ -2814,26 +2814,17 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
 	return ifp;
 }
 
-struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info)
+int btrfs_backref_iter_init(struct btrfs_backref_iter *iter)
 {
-	struct btrfs_backref_iter *ret;
-
-	ret = kzalloc_obj(*ret, GFP_NOFS);
-	if (!ret)
-		return NULL;
-
-	ret->path = btrfs_alloc_path();
-	if (!ret->path) {
-		kfree(ret);
-		return NULL;
-	}
+	iter->path = btrfs_alloc_path();
+	if (!iter->path)
+		return -ENOMEM;
 
 	/* Current backref iterator only supports iteration in commit root */
-	ret->path->search_commit_root = true;
-	ret->path->skip_locking = true;
-	ret->fs_info = fs_info;
+	iter->path->search_commit_root = true;
+	iter->path->skip_locking = true;
 
-	return ret;
+	return 0;
 }
 
 static void btrfs_backref_iter_release(struct btrfs_backref_iter *iter)
@@ -2846,9 +2837,8 @@ static void btrfs_backref_iter_release(struct btrfs_backref_iter *iter)
 	memset(&iter->cur_key, 0, sizeof(iter->cur_key));
 }
 
-int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
+int btrfs_backref_iter_start(struct btrfs_fs_info *fs_info, struct btrfs_backref_iter *iter, u64 bytenr)
 {
-	struct btrfs_fs_info *fs_info = iter->fs_info;
 	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
 	struct btrfs_path *path = iter->path;
 	struct btrfs_extent_item *ei;
@@ -2963,7 +2953,7 @@ static bool btrfs_backref_iter_is_inline_ref(struct btrfs_backref_iter *iter)
  * Return >0 if there is no extra backref for this bytenr.
  * Return <0 if there is something wrong happened.
  */
-int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
+int btrfs_backref_iter_next(struct btrfs_fs_info *fs_info, struct btrfs_backref_iter *iter)
 {
 	struct extent_buffer *eb = iter->path->nodes[0];
 	struct btrfs_root *extent_root;
@@ -2974,7 +2964,9 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
 
 	if (btrfs_backref_iter_is_inline_ref(iter)) {
 		/* We're still inside the inline refs */
-		ASSERT(iter->cur_ptr < iter->end_ptr);
+		ASSERT(iter->cur_ptr < iter->end_ptr,
+		       "iter->cur_ptr=%u iter->end_ptr=%u",
+		       iter->cur_ptr, iter->end_ptr);
 
 		if (btrfs_backref_has_tree_block_info(iter)) {
 			/* First tree block info */
@@ -2997,10 +2989,9 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
 	}
 
 	/* We're at keyed items, there is no inline item, go to the next one */
-	extent_root = btrfs_extent_root(iter->fs_info, iter->bytenr);
+	extent_root = btrfs_extent_root(fs_info, iter->bytenr);
 	if (unlikely(!extent_root)) {
-		btrfs_err(iter->fs_info,
-			  "missing extent root for extent at bytenr %llu",
+		btrfs_err(fs_info, "missing extent root for extent at bytenr %llu",
 			  iter->bytenr);
 		return -EUCLEAN;
 	}
@@ -3041,7 +3032,7 @@ struct btrfs_backref_node *btrfs_backref_alloc_node(
 {
 	struct btrfs_backref_node *node;
 
-	ASSERT(level >= 0 && level < BTRFS_MAX_LEVEL);
+	ASSERT(level >= 0 && level < BTRFS_MAX_LEVEL, "level=%d", level);
 	node = kzalloc_obj(*node, GFP_NOFS);
 	if (!node)
 		return node;
@@ -3063,7 +3054,7 @@ void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
 	if (node) {
 		ASSERT(list_empty(&node->list));
 		ASSERT(list_empty(&node->lower));
-		ASSERT(node->eb == NULL);
+		ASSERT(node->eb == NULL, "node->eb->start=%llu", node->eb->start);
 		cache->nr_nodes--;
 		btrfs_put_root(node->root);
 		kfree(node);
@@ -3166,15 +3157,18 @@ void btrfs_backref_release_cache(struct btrfs_backref_cache *cache)
 
 	ASSERT(list_empty(&cache->pending_edge));
 	ASSERT(list_empty(&cache->useless_node));
-	ASSERT(!cache->nr_nodes);
-	ASSERT(!cache->nr_edges);
+	ASSERT(!cache->nr_nodes, "cache->nr_nodes=%d", cache->nr_nodes);
+	ASSERT(!cache->nr_edges, "cache->nr_edges=%d", cache->nr_edges);
 }
 
 static void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
 				    struct btrfs_backref_node *lower,
 				    struct btrfs_backref_node *upper)
 {
-	ASSERT(upper && lower && upper->level == lower->level + 1);
+	ASSERT(upper != NULL);
+	ASSERT(lower != NULL);
+	ASSERT(upper->level == lower->level + 1, "upper->level=%d lower->level=%d",
+	       upper->level, lower->level);
 	edge->node[LOWER] = lower;
 	edge->node[UPPER] = upper;
 	list_add_tail(&edge->list[LOWER], &lower->upper);
@@ -3199,7 +3193,7 @@ static int handle_direct_tree_backref(struct btrfs_backref_cache *cache,
 	struct btrfs_backref_node *upper;
 	struct rb_node *rb_node;
 
-	ASSERT(ref_key->type == BTRFS_SHARED_BLOCK_REF_KEY);
+	ASSERT(ref_key->type == BTRFS_SHARED_BLOCK_REF_KEY, "ref_key->type=%hhu", ref_key->type);
 
 	/* Only reloc root uses backref pointing to itself */
 	if (ref_key->objectid == ref_key->offset) {
@@ -3294,7 +3288,9 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
 
 	if (btrfs_root_level(&root->root_item) == cur->level) {
 		/* Tree root */
-		ASSERT(btrfs_root_bytenr(&root->root_item) == cur->bytenr);
+		ASSERT(btrfs_root_bytenr(&root->root_item) == cur->bytenr,
+		       "root_bytenr=%llu cur->bytenr=%llu",
+		       btrfs_root_bytenr(&root->root_item), cur->bytenr);
 		/*
 		 * For reloc backref cache, we may ignore reloc root.  But for
 		 * general purpose backref cache, we can't rely on
@@ -3344,8 +3340,9 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
 	/* Add all nodes and edges in the path */
 	for (; level < BTRFS_MAX_LEVEL; level++) {
 		if (!path->nodes[level]) {
-			ASSERT(btrfs_root_bytenr(&root->root_item) ==
-			       lower->bytenr);
+			ASSERT(btrfs_root_bytenr(&root->root_item) == lower->bytenr,
+			       "root_bytenr=%llu lower->bytenr=%llu",
+			       btrfs_root_bytenr(&root->root_item), lower->bytenr);
 			/* Same as previous should_ignore_reloc_root() call */
 			if (btrfs_should_ignore_reloc_root(root) &&
 			    cache->is_reloc) {
@@ -3454,7 +3451,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
 	struct btrfs_backref_node *exist;
 	int ret;
 
-	ret = btrfs_backref_iter_start(iter, cur->bytenr);
+	ret = btrfs_backref_iter_start(trans->fs_info, iter, cur->bytenr);
 	if (ret < 0)
 		return ret;
 	/*
@@ -3462,7 +3459,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
 	 * stored in it, but fetch it from the tree block
 	 */
 	if (btrfs_backref_has_tree_block_info(iter)) {
-		ret = btrfs_backref_iter_next(iter);
+		ret = btrfs_backref_iter_next(trans->fs_info, iter);
 		if (ret < 0)
 			goto out;
 		/* No extra backref? This means the tree block is corrupted */
@@ -3492,7 +3489,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
 		exist = NULL;
 	}
 
-	for (; ret == 0; ret = btrfs_backref_iter_next(iter)) {
+	for (; ret == 0; ret = btrfs_backref_iter_next(trans->fs_info, iter)) {
 		struct extent_buffer *eb;
 		struct btrfs_key key;
 		int type;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 1d009b0f4c69..179791de6b19 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -278,15 +278,12 @@ struct prelim_ref {
 struct btrfs_backref_iter {
 	u64 bytenr;
 	struct btrfs_path *path;
-	struct btrfs_fs_info *fs_info;
 	struct btrfs_key cur_key;
 	u32 item_ptr;
 	u32 cur_ptr;
 	u32 end_ptr;
 };
 
-struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info);
-
 /*
  * For metadata with EXTENT_ITEM key (non-skinny) case, the first inline data
  * is btrfs_tree_block_info, without a btrfs_extent_inline_ref header.
@@ -302,9 +299,11 @@ static inline bool btrfs_backref_has_tree_block_info(
 	return false;
 }
 
-int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr);
+int btrfs_backref_iter_init(struct btrfs_backref_iter *iter);
+
+int btrfs_backref_iter_start(struct btrfs_fs_info *fs_info, struct btrfs_backref_iter *iter, u64 bytenr);
 
-int btrfs_backref_iter_next(struct btrfs_backref_iter *iter);
+int btrfs_backref_iter_next(struct btrfs_fs_info *fs_info, struct btrfs_backref_iter *iter);
 
 /*
  * Backref cache related structures
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index b611c64119db..ab76a5173272 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -22,6 +22,34 @@
 #include "accessors.h"
 #include "extent-tree.h"
 
+static struct kmem_cache *block_group_cache;
+static struct kmem_cache *free_space_ctl_cache;
+
+int __init btrfs_init_block_group(void)
+{
+	block_group_cache = kmem_cache_create("btrfs_block_group",
+					      sizeof(struct btrfs_block_group),
+					      0, 0, NULL);
+	if (!block_group_cache)
+		return -ENOMEM;
+
+	free_space_ctl_cache = kmem_cache_create("btrfs_free_space_ctl",
+						 sizeof(struct btrfs_free_space_ctl),
+						 0, 0, NULL);
+	if (!free_space_ctl_cache) {
+		kmem_cache_destroy(block_group_cache);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void __cold btrfs_exit_block_group(void)
+{
+	kmem_cache_destroy(block_group_cache);
+	kmem_cache_destroy(free_space_ctl_cache);
+}
+
 #ifdef CONFIG_BTRFS_DEBUG
 int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group)
 {
@@ -180,9 +208,9 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
 			btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
 						  cache);
 
-		kfree(cache->free_space_ctl);
+		kmem_cache_free(free_space_ctl_cache, cache->free_space_ctl);
 		btrfs_free_chunk_map(cache->physical_map);
-		kfree(cache);
+		kmem_cache_free(block_group_cache, cache);
 	}
 }
 
@@ -2371,13 +2399,13 @@ static struct btrfs_block_group *btrfs_create_block_group(
 {
 	struct btrfs_block_group *cache;
 
-	cache = kzalloc_obj(*cache, GFP_NOFS);
+	cache = kmem_cache_zalloc(block_group_cache, GFP_NOFS);
 	if (!cache)
 		return NULL;
 
-	cache->free_space_ctl = kzalloc_obj(*cache->free_space_ctl, GFP_NOFS);
+	cache->free_space_ctl = kmem_cache_zalloc(free_space_ctl_cache, GFP_NOFS);
 	if (!cache->free_space_ctl) {
-		kfree(cache);
+		kmem_cache_free(block_group_cache, cache);
 		return NULL;
 	}
 
@@ -2454,7 +2482,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
 static int read_one_block_group(struct btrfs_fs_info *info,
 				struct btrfs_block_group_item_v2 *bgi,
 				const struct btrfs_key *key,
-				int need_clear)
+				bool need_clear)
 {
 	struct btrfs_block_group *cache;
 	const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
@@ -2635,7 +2663,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
 	struct btrfs_block_group *cache;
 	struct btrfs_space_info *space_info;
 	struct btrfs_key key;
-	int need_clear = 0;
+	bool need_clear = false;
 	u64 cache_gen;
 
 	/*
@@ -2660,9 +2688,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
 	cache_gen = btrfs_super_cache_generation(info->super_copy);
 	if (btrfs_test_opt(info, SPACE_CACHE) &&
 	    btrfs_super_generation(info->super_copy) != cache_gen)
-		need_clear = 1;
+		need_clear = true;
 	if (btrfs_test_opt(info, CLEAR_CACHE))
-		need_clear = 1;
+		need_clear = true;
 
 	while (1) {
 		struct btrfs_block_group_item_v2 bgi;
@@ -4089,7 +4117,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
 	struct btrfs_space_info *space_info;
 
 	space_info = btrfs_find_space_info(trans->fs_info, type);
-	if (!space_info) {
+	if (unlikely(!space_info)) {
 		DEBUG_WARN();
 		return -EINVAL;
 	}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 0504cb357992..790c2d467af5 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -122,6 +122,7 @@ struct btrfs_block_group {
 	struct btrfs_fs_info *fs_info;
 	struct btrfs_inode *inode;
 	spinlock_t lock;
+	unsigned int ro;
 	u64 start;
 	u64 length;
 	u64 pinned;
@@ -134,7 +135,8 @@ struct btrfs_block_group {
 	u64 global_root_id;
 	u64 remap_bytes;
 	u32 identity_remap_count;
-
+	/* The last committed identity_remap_count value of this block group. */
+	u32 last_identity_remap_count;
 	/*
 	 * The last committed used bytes of this block group, if the above @used
 	 * is still the same as @last_used, we don't need to update block
@@ -143,8 +145,6 @@ struct btrfs_block_group {
 	u64 last_used;
 	/* The last committed remap_bytes value of this block group. */
 	u64 last_remap_bytes;
-	/* The last commited identity_remap_count value of this block group. */
-	u32 last_identity_remap_count;
 	/* The last committed flags value for this block group. */
 	u64 last_flags;
 
@@ -171,12 +171,10 @@ struct btrfs_block_group {
 	unsigned long full_stripe_len;
 	unsigned long runtime_flags;
 
-	unsigned int ro;
-
-	int disk_cache_state;
+	enum btrfs_disk_cache_state disk_cache_state;
 
 	/* Cache tracking stuff */
-	int cached;
+	enum btrfs_caching_type cached;
 	struct btrfs_caching_control *caching_ctl;
 
 	struct btrfs_space_info *space_info;
@@ -193,6 +191,16 @@ struct btrfs_block_group {
 	refcount_t refs;
 
 	/*
+	 * When non-zero it means the block group's logical address and its
+	 * device extents can not be reused for future block group allocations
+	 * until the counter goes down to 0. This is to prevent them from being
+	 * reused while some task is still using the block group after it was
+	 * deleted - we want to make sure they can only be reused for new block
+	 * groups after that task is done with the deleted block group.
+	 */
+	atomic_t frozen;
+
+	/*
 	 * List of struct btrfs_free_clusters for this block group.
 	 * Today it will only have one thing on it, but that may change
 	 */
@@ -211,22 +219,12 @@ struct btrfs_block_group {
 	/* For read-only block groups */
 	struct list_head ro_list;
 
-	/*
-	 * When non-zero it means the block group's logical address and its
-	 * device extents can not be reused for future block group allocations
-	 * until the counter goes down to 0. This is to prevent them from being
-	 * reused while some task is still using the block group after it was
-	 * deleted - we want to make sure they can only be reused for new block
-	 * groups after that task is done with the deleted block group.
-	 */
-	atomic_t frozen;
-
 	/* For discard operations */
 	struct list_head discard_list;
 	int discard_index;
+	enum btrfs_discard_state discard_state;
 	u64 discard_eligible_time;
 	u64 discard_cursor;
-	enum btrfs_discard_state discard_state;
 
 	/* For dirty block groups */
 	struct list_head dirty_list;
@@ -263,6 +261,8 @@ struct btrfs_block_group {
 	/* Protected by @free_space_lock. */
 	bool using_free_space_bitmaps_cached;
 
+	enum btrfs_block_group_size_class size_class:8;
+
 	/*
 	 * Number of extents in this block group used for swap files.
 	 * All accesses protected by the spinlock 'lock'.
@@ -281,7 +281,6 @@ struct btrfs_block_group {
 	struct list_head active_bg_list;
 	struct work_struct zone_finish_work;
 	struct extent_buffer *last_eb;
-	enum btrfs_block_group_size_class size_class;
 	u64 reclaim_mark;
 };
 
@@ -320,6 +319,9 @@ static inline u64 btrfs_block_group_available_space(const struct btrfs_block_gro
 int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group);
 #endif
 
+int __init btrfs_init_block_group(void);
+void __cold btrfs_exit_block_group(void);
+
 struct btrfs_block_group *btrfs_lookup_first_block_group(
 		struct btrfs_fs_info *info, u64 bytenr);
 struct btrfs_block_group *btrfs_lookup_block_group(
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 55c272fe5d92..d5d81f9546c3 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -128,15 +128,6 @@ struct btrfs_inode {
 	/* which subvolume this inode belongs to */
 	struct btrfs_root *root;
 
-#if BITS_PER_LONG == 32
-	/*
-	 * The objectid of the corresponding BTRFS_INODE_ITEM_KEY.
-	 * On 64 bits platforms we can get it from vfs_inode.i_ino, which is an
-	 * unsigned long and therefore 64 bits on such platforms.
-	 */
-	u64 objectid;
-#endif
-
 	/* Cached value of inode property 'compression'. */
 	u8 prop_compress;
 
@@ -372,30 +363,11 @@ static inline unsigned long btrfs_inode_hash(u64 objectid,
 	return (unsigned long)h;
 }
 
-#if BITS_PER_LONG == 32
-
-/*
- * On 32 bit systems the i_ino of struct inode is 32 bits (unsigned long), so
- * we use the inode's location objectid which is a u64 to avoid truncation.
- */
-static inline u64 btrfs_ino(const struct btrfs_inode *inode)
-{
-	u64 ino = inode->objectid;
-
-	if (test_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags))
-		ino = inode->vfs_inode.i_ino;
-	return ino;
-}
-
-#else
-
 static inline u64 btrfs_ino(const struct btrfs_inode *inode)
 {
 	return inode->vfs_inode.i_ino;
 }
 
-#endif
-
 static inline void btrfs_get_inode_key(const struct btrfs_inode *inode,
 				       struct btrfs_key *key)
 {
@@ -406,9 +378,6 @@ static inline void btrfs_get_inode_key(const struct btrfs_inode *inode,
 
 static inline void btrfs_set_inode_number(struct btrfs_inode *inode, u64 ino)
 {
-#if BITS_PER_LONG == 32
-	inode->objectid = ino;
-#endif
 	inode->vfs_inode.i_ino = ino;
 }
 
@@ -531,12 +500,9 @@ static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode)
 	/* Metadata inode should not reach here. */
 	ASSERT(is_data_inode(inode));
 
-	/* We only allow BITS_PER_LONGS blocks for each bitmap. */
-#ifdef CONFIG_BTRFS_EXPERIMENTAL
 	mapping_set_folio_order_range(inode->vfs_inode.i_mapping,
 				      inode->root->fs_info->block_min_order,
 				      inode->root->fs_info->block_max_order);
-#endif
 }
 
 void btrfs_calculate_block_csum_folio(struct btrfs_fs_info *fs_info,
@@ -569,6 +535,8 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
 int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
 			      unsigned int extra_bits,
 			      struct extent_state **cached_state);
+int btrfs_reset_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
+				unsigned int extra_bits, struct extent_state **cached_state);
 
 struct btrfs_new_inode_args {
 	/* Input */
@@ -630,7 +598,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
 				    loff_t actual_len, u64 *alloc_hint);
 int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio,
 			     u64 start, u64 end, struct writeback_control *wbc);
-int btrfs_writepage_cow_fixup(struct folio *folio);
 int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
 					     int compress_type);
 int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index a02b62e0a8f3..ffb6b52863a7 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -355,21 +355,16 @@ struct compressed_bio *btrfs_alloc_compressed_write(struct btrfs_inode *inode,
 }
 
 /*
- * Add extra pages in the same compressed file extent so that we don't need to
+ * Add extra folios in the same compressed file extent so that we don't need to
  * re-read the same extent again and again.
  *
- * NOTE: this won't work well for subpage, as for subpage read, we lock the
- * full page then submit bio for each compressed/regular extents.
- *
- * This means, if we have several sectors in the same page points to the same
- * on-disk compressed data, we will re-read the same extent many times and
- * this function can only help for the next page.
+ * If in the same folio, we have several non-contiguous blocks which are pointing
+ * to the same on-disk compressed data, we will re-read the same extent many
+ * times, as this function can only help cross folio situations.
  */
-static noinline int add_ra_bio_pages(struct inode *inode,
-				     u64 compressed_end,
-				     struct compressed_bio *cb,
-				     int *memstall, unsigned long *pflags,
-				     bool direct_reclaim)
+static noinline int add_ra_bio_folios(struct inode *inode, u64 compressed_end,
+				      struct compressed_bio *cb, int *memstall,
+				      unsigned long *pflags, bool direct_reclaim)
 {
 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
 	pgoff_t end_index;
@@ -391,16 +386,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 	if (isize == 0)
 		return 0;
 
-	/*
-	 * For current subpage support, we only support 64K page size,
-	 * which means maximum compressed extent size (128K) is just 2x page
-	 * size.
-	 * This makes readahead less effective, so here disable readahead for
-	 * subpage for now, until full compressed write is supported.
-	 */
-	if (fs_info->sectorsize < PAGE_SIZE)
-		return 0;
-
 	/* For bs > ps cases, we don't support readahead for compressed folios for now. */
 	if (fs_info->block_min_order)
 		return 0;
@@ -416,7 +401,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 	}
 
 	while (cur < compressed_end) {
-		pgoff_t page_end;
+		u64 folio_end;
 		pgoff_t pg_index = cur >> PAGE_SHIFT;
 		gfp_t masked_constraint_gfp;
 		u32 add_size;
@@ -438,8 +423,8 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 				break;
 
 			/*
-			 * Jump to next page start as we already have page for
-			 * current offset.
+			 * Jump to the next folio as we already have a folio for
+			 * the current offset.
 			 */
 			cur += (folio_sz - offset);
 			continue;
@@ -457,8 +442,8 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 			break;
 
 		if (filemap_add_folio(mapping, folio, pg_index, cache_gfp)) {
-			/* There is already a page, skip to page end */
-			cur += folio_size(folio);
+			/* There is already a folio, skip to the folio end. */
+			cur += folio_size(folio) - offset_in_folio(folio, cur);
 			folio_put(folio);
 			continue;
 		}
@@ -475,14 +460,14 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 			break;
 		}
 
-		page_end = (pg_index << PAGE_SHIFT) + folio_size(folio) - 1;
-		btrfs_lock_extent(tree, cur, page_end, NULL);
+		folio_end = folio_next_pos(folio) - 1;
+		btrfs_lock_extent(tree, cur, folio_end, NULL);
 		read_lock(&em_tree->lock);
-		em = btrfs_lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
+		em = btrfs_lookup_extent_mapping(em_tree, cur, folio_end + 1 - cur);
 		read_unlock(&em_tree->lock);
 
 		/*
-		 * At this point, we have a locked page in the page cache for
+		 * At this point, we have a locked folio in the page cache for
 		 * these bytes in the file.  But, we have to make sure they map
 		 * to this compressed extent on disk.
 		 */
@@ -491,14 +476,14 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 		    (btrfs_extent_map_block_start(em) >> SECTOR_SHIFT) !=
 		    orig_bio->bi_iter.bi_sector) {
 			btrfs_free_extent_map(em);
-			btrfs_unlock_extent(tree, cur, page_end, NULL);
+			btrfs_unlock_extent(tree, cur, folio_end, NULL);
 			folio_unlock(folio);
 			folio_put(folio);
 			break;
 		}
-		add_size = min(btrfs_extent_map_end(em), page_end + 1) - cur;
+		add_size = min(btrfs_extent_map_end(em), folio_end + 1) - cur;
 		btrfs_free_extent_map(em);
-		btrfs_unlock_extent(tree, cur, page_end, NULL);
+		btrfs_unlock_extent(tree, cur, folio_end, NULL);
 
 		if (folio_contains(folio, end_index)) {
 			size_t zero_offset = offset_in_folio(folio, isize);
@@ -516,13 +501,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 			folio_put(folio);
 			break;
 		}
-		/*
-		 * If it's subpage, we also need to increase its
-		 * subpage::readers number, as at endio we will decrease
-		 * subpage::readers and to unlock the page.
-		 */
-		if (fs_info->sectorsize < PAGE_SIZE)
-			btrfs_folio_set_lock(fs_info, folio, cur, add_size);
+		btrfs_folio_set_lock(fs_info, folio, cur, add_size);
 		folio_put(folio);
 		cur += add_size;
 	}
@@ -613,8 +592,8 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
 	}
 	ASSERT(cb->bbio.bio.bi_iter.bi_size == compressed_len);
 
-	add_ra_bio_pages(&inode->vfs_inode, em_start + em_len, cb, &memstall,
-			 &pflags, !(bbio->bio.bi_opf & REQ_RAHEAD));
+	add_ra_bio_folios(&inode->vfs_inode, em_start + em_len, cb, &memstall,
+			  &pflags, !(bbio->bio.bi_opf & REQ_RAHEAD));
 
 	cb->len = bbio->bio.bi_iter.bi_size;
 	cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector;
@@ -1192,22 +1171,6 @@ void __cold btrfs_exit_compress(void)
 }
 
 /*
- * The bvec is a single page bvec from a bio that contains folios from a filemap.
- *
- * Since the folio may be a large one, and if the bv_page is not a head page of
- * a large folio, then page->index is unreliable.
- *
- * Thus we need this helper to grab the proper file offset.
- */
-static u64 file_offset_from_bvec(const struct bio_vec *bvec)
-{
-	const struct page *page = bvec->bv_page;
-	const struct folio *folio = page_folio(page);
-
-	return (page_pgoff(folio, page) << PAGE_SHIFT) + bvec->bv_offset;
-}
-
-/*
  * Copy decompressed data from working buffer to pages.
  *
  * @buf:		The decompressed data buffer
@@ -1259,7 +1222,7 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
 		 * cb->start may underflow, but subtracting that value can still
 		 * give us correct offset inside the full decompressed extent.
 		 */
-		bvec_offset = file_offset_from_bvec(&bvec) - cb->start;
+		bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start;
 
 		/* Haven't reached the bvec range, exit */
 		if (decompressed + buf_len <= bvec_offset)
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index d70da290bedf..49fb6b816aa9 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -475,13 +475,10 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
 	struct extent_buffer *cow;
 	int level, ret;
 	int last_ref = 0;
-	int unlock_orig = 0;
+	const bool unlock_orig = (*cow_ret == buf);
 	u64 parent_start = 0;
 	u64 reloc_src_root = 0;
 
-	if (*cow_ret == buf)
-		unlock_orig = 1;
-
 	btrfs_assert_tree_write_locked(buf);
 
 	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
@@ -1497,17 +1494,11 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
 		if (p->reada == READA_FORWARD_ALWAYS)
 			reada_for_search(fs_info, p, parent_level, slot, key->objectid);
 
-		/* first we do an atomic uptodate check */
-		if (btrfs_buffer_uptodate(tmp, check.transid, NULL) > 0) {
-			/*
-			 * Do extra check for first_key, eb can be stale due to
-			 * being cached, read from scrub, or have multiple
-			 * parents (shared tree blocks).
-			 */
-			if (unlikely(btrfs_verify_level_key(tmp, &check))) {
-				ret = -EUCLEAN;
-				goto out;
-			}
+		/* Check if the cached eb is uptodate. */
+		ret = btrfs_buffer_uptodate(tmp, check.transid, &check);
+		if (unlikely(ret < 0))
+			goto out;
+		if (ret > 0) {
 			*eb_ret = tmp;
 			tmp = NULL;
 			ret = 0;
@@ -2075,7 +2066,7 @@ again:
 	}
 
 	while (b) {
-		int dec = 0;
+		bool dec = false;
 		int ret2;
 
 		level = btrfs_header_level(b);
@@ -2158,7 +2149,7 @@ cow_done:
 		prev_cmp = ret;
 
 		if (ret && slot > 0) {
-			dec = 1;
+			dec = true;
 			slot--;
 		}
 		p->slots[level] = slot;
@@ -2288,7 +2279,7 @@ again:
 	p->locks[level] = BTRFS_READ_LOCK;
 
 	while (b) {
-		int dec = 0;
+		bool dec = false;
 		int ret2;
 
 		level = btrfs_header_level(b);
@@ -2313,7 +2304,7 @@ again:
 		}
 
 		if (ret && slot > 0) {
-			dec = 1;
+			dec = true;
 			slot--;
 		}
 		p->slots[level] = slot;
@@ -3674,7 +3665,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	int wret;
 	int split;
 	int num_doubles = 0;
-	int tried_avoid_double = 0;
+	bool tried_avoid_double = false;
 
 	l = path->nodes[0];
 	slot = path->slots[0];
@@ -3836,7 +3827,7 @@ again:
 
 push_for_double:
 	push_for_double_split(trans, root, path, data_size);
-	tried_avoid_double = 1;
+	tried_avoid_double = true;
 	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
 		return 0;
 	goto again;
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 7e2db5d3a4d4..f0c6758b7055 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -859,23 +859,6 @@ again:
 	if (IS_ERR(folio))
 		return folio;
 
-	/*
-	 * Since we can defragment files opened read-only, we can encounter
-	 * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS).
-	 *
-	 * The IO for such large folios is not fully tested, thus return
-	 * an error to reject such folios unless it's an experimental build.
-	 *
-	 * Filesystem transparent huge pages are typically only used for
-	 * executables that explicitly enable them, so this isn't very
-	 * restrictive.
-	 */
-	if (!IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && folio_test_large(folio)) {
-		folio_unlock(folio);
-		folio_put(folio);
-		return ERR_PTR(-ETXTBSY);
-	}
-
 	ret = set_folio_extent_mapped(folio);
 	if (ret < 0) {
 		folio_unlock(folio);
@@ -1179,7 +1162,6 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
 		if (start >= folio_next_pos(folio) ||
 		    start + len <= folio_pos(folio))
 			continue;
-		btrfs_folio_clamp_clear_checked(fs_info, folio, start, len);
 		btrfs_folio_clamp_set_dirty(fs_info, folio, start, len);
 	}
 	btrfs_delalloc_release_extents(inode, len);
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 0970799d0aa4..d357ed7efd99 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -134,6 +134,8 @@ int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes)
 
 	if (btrfs_is_free_space_inode(inode))
 		flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
+	else if (btrfs_is_zoned(fs_info) && btrfs_is_data_reloc_root(root))
+		flush = BTRFS_RESERVE_FLUSH_ZONED_RELOCATION;
 
 	return btrfs_reserve_data_bytes(data_sinfo_for_inode(inode), bytes, flush);
 }
@@ -279,7 +281,7 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
 	 *
 	 * This is overestimating in most cases.
 	 */
-	qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;
+	qgroup_rsv_size = ((u64)outstanding_extents << fs_info->nodesize_bits);
 
 	spin_lock(&block_rsv->lock);
 	block_rsv->size = reserve_size;
@@ -309,7 +311,7 @@ static void calc_inode_reservations(struct btrfs_inode *inode,
 	 * for an inode update.
 	 */
 	*meta_reserve += inode_update;
-	*qgroup_reserve = nr_extents * fs_info->nodesize;
+	*qgroup_reserve = (nr_extents << fs_info->nodesize_bits);
 }
 
 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 605858c2d9a9..8bc1929237f7 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -615,6 +615,9 @@ static bool insert_delayed_ref(struct btrfs_trans_handle *trans,
 	struct btrfs_delayed_ref_node *exist;
 	int mod;
 
+	ASSERT(ref->action == BTRFS_ADD_DELAYED_REF ||
+	       ref->action == BTRFS_DROP_DELAYED_REF);
+
 	spin_lock(&href->lock);
 	exist = tree_insert(&href->ref_tree, ref);
 	if (!exist) {
@@ -641,7 +644,7 @@ static bool insert_delayed_ref(struct btrfs_trans_handle *trans,
 				ASSERT(!list_empty(&exist->add_list));
 				list_del_init(&exist->add_list);
 			} else {
-				ASSERT(0);
+				DEBUG_WARN();
 			}
 		} else
 			mod = -ref->ref_mod;
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 8f8fa14886de..318ddb790429 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -307,6 +307,8 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 	device->bdev_file = bdev_file;
 	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
 	set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
+	/* Check the comment in btrfs_init_new_device() for the reason. */
+	atomic_inc(&device->dev_stats_ccnt);
 	device->dev_stats_valid = 1;
 	set_blocksize(bdev_file, BTRFS_BDEV_BLOCKSIZE);
 	device->fs_devices = fs_devices;
@@ -1013,8 +1015,15 @@ error:
 
 	/* write back the superblocks */
 	trans = btrfs_start_transaction(root, 0);
-	if (!IS_ERR(trans))
+	if (!IS_ERR(trans)) {
+		/*
+		 * Ignore any error here, if we failed to remove the DEV_STATS
+		 * item for devid 0, it's not a big deal.  We have other ways
+		 * to address it.
+		 */
+		btrfs_remove_dev_stat_item(trans, BTRFS_DEV_REPLACE_DEVID);
 		btrfs_commit_transaction(trans);
+	}
 
 	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 
diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c
index 57167d56dc72..460326d34143 100644
--- a/fs/btrfs/direct-io.c
+++ b/fs/btrfs/direct-io.c
@@ -15,10 +15,12 @@
 
 struct btrfs_dio_data {
 	ssize_t submitted;
+	loff_t old_isize;
 	struct extent_changeset *data_reserved;
 	struct btrfs_ordered_extent *ordered;
 	bool data_space_reserved;
 	bool nocow_done;
+	bool updated_isize;
 };
 
 struct btrfs_dio_private {
@@ -228,6 +230,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
 	bool space_reserved = false;
 	u64 len = *lenp;
 	u64 prev_len;
+	loff_t old_isize;
 	int ret = 0;
 
 	/*
@@ -341,8 +344,14 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
 	 * Need to update the i_size under the extent lock so buffered
 	 * readers will get the updated i_size when we unlock.
 	 */
-	if (start + len > i_size_read(inode))
+	old_isize = i_size_read(inode);
+	if (start + len > old_isize) {
+		if (!dio_data->updated_isize) {
+			dio_data->old_isize = old_isize;
+			dio_data->updated_isize = true;
+		}
 		i_size_write(inode, start + len);
+	}
 out:
 	if (ret && space_reserved) {
 		btrfs_delalloc_release_extents(BTRFS_I(inode), len);
@@ -624,12 +633,55 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
 	if (submitted < length) {
 		pos += submitted;
 		length -= submitted;
-		if (write)
+		if (write) {
+			/*
+			 * Got a short write and have updated the isize, need to
+			 * revert the isize change.
+			 *
+			 * Normally we need to update isize with extent lock held,
+			 * but we're safe due to the following factors:
+			 *
+			 * - Only a single writer can be enlarging isize
+			 *   Enlarging isize will take the exclusive inode lock.
+			 *
+			 * - Buffered readers need to wait for the OE we're holding
+			 *   Buffered readers will lock extent and wait for OE
+			 *   of the folio range, and since page cache is invalidated
+			 *   the OE wait can not be skipped.
+			 *
+			 * So here we are safe to revert the isize before
+			 * finishing the OE, and no reader of the remaining range
+			 * can see the enlarged size.
+			 *
+			 * TODO: Extend the DIO_LOCKED lifespan for direct writes,
+			 * and only enlarge isize after a successful write.
+			 */
+			if (dio_data->updated_isize) {
+				u64 new_isize;
+
+				if (submitted == 0)
+					new_isize = dio_data->old_isize;
+				else
+					new_isize = max(dio_data->old_isize, pos);
+				i_size_write(inode, new_isize);
+				dio_data->updated_isize = false;
+			}
+			/*
+			 * We have a short write, if there is any range
+			 * that is submitted properly, that part will have
+			 * its own OE split from the original one.
+			 *
+			 * So for the OE at dio_data->ordered, it's the part
+			 * that is not submitted, and should be marked
+			 * as fully truncated.
+			 */
+			btrfs_mark_ordered_extent_truncated(dio_data->ordered, 0);
 			btrfs_finish_ordered_extent(dio_data->ordered,
-						    pos, length, false);
-		else
+						    pos, length, true);
+		} else {
 			btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
 						pos + length - 1, NULL);
+		}
 		ret = -ENOTBLK;
 	}
 	if (write) {
@@ -926,7 +978,7 @@ again:
 	if (ret > 0)
 		written = ret;
 
-	if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
+	if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret >= 0)) {
 		const size_t left = iov_iter_count(from);
 		/*
 		 * We have more data left to write. Try to fault in as many as
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c0a30bb213d7..0a7d80da9c94 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -215,7 +215,7 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb,
 			     const struct btrfs_tree_parent_check *check)
 {
 	struct btrfs_fs_info *fs_info = eb->fs_info;
-	int failed = 0;
+	bool failed = false;
 	int ret;
 	int num_copies = 0;
 	int mirror_num = 0;
@@ -234,7 +234,7 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb,
 			break;
 
 		if (!failed_mirror) {
-			failed = 1;
+			failed = true;
 			failed_mirror = eb->read_mirror;
 		}
 
@@ -491,10 +491,34 @@ static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags)
 static void btree_invalidate_folio(struct folio *folio, size_t offset,
 				 size_t length)
 {
-	struct extent_io_tree *tree;
+	struct extent_io_tree *tree = &folio_to_inode(folio)->io_tree;
+	struct extent_state *cached_state = NULL;
+	const u64 start = folio_pos(folio);
+	const u64 end = folio_next_pos(folio) - 1;
+
+	/*
+	 * The range must cover the full @folio.
+	 * Btree inode is never exposed to regular file operations, thus there
+	 * is no partial truncation.
+	 * The folio is only invalidated when the btree inode is evicted.
+	 */
+	ASSERT(offset == 0, "folio=%llu offset=%zu", folio_pos(folio), offset);
+	ASSERT(length == folio_size(folio), "folio=%llu folio_size=%zu length=%zu",
+	       folio_pos(folio), folio_size(folio), length);
+
+	/* This function is only called for the btree inode */
+	ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
+
+	btrfs_lock_extent(tree, start, end, &cached_state);
+	folio_wait_writeback(folio);
+
+	/*
+	 * Currently for btree io tree, only EXTENT_LOCKED is utilized,
+	 * so here we only need to unlock the extent range to free any
+	 * existing extent state.
+	 */
+	btrfs_unlock_extent(tree, start, end, &cached_state);
 
-	tree = &folio_to_inode(folio)->io_tree;
-	extent_invalidate_folio(tree, folio, offset);
 	btree_release_folio(folio, GFP_NOFS);
 	if (folio_get_private(folio)) {
 		btrfs_warn(folio_to_fs_info(folio),
@@ -539,7 +563,7 @@ static bool btree_dirty_folio(struct address_space *mapping,
 			continue;
 		}
 		spin_unlock_irqrestore(&subpage->lock, flags);
-		cur = page_start + cur_bit * fs_info->sectorsize;
+		cur = page_start + (cur_bit << fs_info->sectorsize_bits);
 
 		eb = find_extent_buffer(fs_info, cur);
 		ASSERT(eb);
@@ -1736,7 +1760,6 @@ static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
 /* helper to cleanup workers */
 static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
 {
-	btrfs_destroy_workqueue(fs_info->fixup_workers);
 	btrfs_destroy_workqueue(fs_info->delalloc_workers);
 	btrfs_destroy_workqueue(fs_info->workers);
 	if (fs_info->endio_workers)
@@ -1944,9 +1967,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
 	fs_info->caching_workers =
 		btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
 
-	fs_info->fixup_workers =
-		btrfs_alloc_ordered_workqueue(fs_info, "fixup", ordered_flags);
-
 	fs_info->endio_workers =
 		alloc_workqueue("btrfs-endio", flags, max_active);
 	fs_info->endio_meta_workers =
@@ -1972,7 +1992,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
 	      fs_info->endio_workers && fs_info->endio_meta_workers &&
 	      fs_info->endio_write_workers &&
 	      fs_info->endio_freespace_worker && fs_info->rmw_workers &&
-	      fs_info->caching_workers && fs_info->fixup_workers &&
+	      fs_info->caching_workers &&
 	      fs_info->delayed_workers && fs_info->qgroup_rescan_workers &&
 	      fs_info->discard_ctl.discard_workers)) {
 		return -ENOMEM;
@@ -2776,6 +2796,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	mutex_init(&fs_info->unused_bg_unpin_mutex);
 	mutex_init(&fs_info->reclaim_bgs_lock);
 	mutex_init(&fs_info->reloc_mutex);
+	spin_lock_init(&fs_info->reloc_ctl_lock);
 	mutex_init(&fs_info->delalloc_root_mutex);
 	mutex_init(&fs_info->zoned_meta_io_lock);
 	mutex_init(&fs_info->zoned_data_reloc_io_lock);
@@ -3276,6 +3297,64 @@ static bool fs_is_full_ro(const struct btrfs_fs_info *fs_info)
 	return false;
 }
 
+/*
+ * Try to wait for any metadata readahead, and invalidate all btree folios.
+ *
+ * If the invalidation failed, report any dirty/held extent buffers.
+ */
+static void invalidate_and_check_btree_folios(struct btrfs_fs_info *fs_info)
+{
+	unsigned long index = 0;
+	struct extent_buffer *eb;
+	int ret;
+
+	ret = invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
+	if (likely(ret == 0))
+		return;
+
+	/*
+	 * Some btree pages can not be invalidated, this happens when some tree
+	 * blocks are still held (either by readahead or some task is holding a ref).
+	 */
+	rcu_read_lock();
+	xa_for_each(&fs_info->buffer_tree, index, eb) {
+		/* Increase the ref so that the eb won't disappear. */
+		if (!refcount_inc_not_zero(&eb->refs))
+			continue;
+		rcu_read_unlock();
+
+		/* Wait for any readahead first. */
+		if (test_bit(EXTENT_BUFFER_READING, &eb->bflags))
+			wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING,
+				       TASK_UNINTERRUPTIBLE);
+		/*
+		 * The refs threshold is 2, one held by us at the beginning
+		 * of the loop, one for the ownership in the buffer tree.
+		 */
+		if (unlikely(refcount_read(&eb->refs) > 2 || extent_buffer_under_io(eb))) {
+			WARN_ON_ONCE(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+			btrfs_warn(fs_info,
+			"unable to release extent buffer %llu owner %llu gen %llu refs %u flags 0x%lx",
+				   eb->start, btrfs_header_owner(eb),
+				   btrfs_header_generation(eb),
+				   refcount_read(&eb->refs), eb->bflags);
+		}
+		free_extent_buffer(eb);
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
+	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
+}
+
+static u32 calc_block_max_order(u32 sectorsize_bits)
+{
+	u32 max_size;
+
+	max_size = min(BTRFS_MAX_BLOCKS_PER_FOLIO << sectorsize_bits,
+		       BTRFS_MAX_FOLIO_SIZE);
+	return ilog2(round_up(max_size, PAGE_SIZE) >> PAGE_SHIFT);
+}
+
 int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices)
 {
 	u32 sectorsize;
@@ -3398,7 +3477,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	fs_info->sectorsize = sectorsize;
 	fs_info->sectorsize_bits = ilog2(sectorsize);
 	fs_info->block_min_order = ilog2(round_up(sectorsize, PAGE_SIZE) >> PAGE_SHIFT);
-	fs_info->block_max_order = ilog2((BITS_PER_LONG << fs_info->sectorsize_bits) >> PAGE_SHIFT);
+	fs_info->block_max_order = calc_block_max_order(fs_info->sectorsize_bits);
 	fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
 	fs_info->stripesize = stripesize;
 	fs_info->fs_devices->fs_info = fs_info;
@@ -3451,7 +3530,16 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	/* Update the values for the current filesystem. */
 	sb->s_blocksize = sectorsize;
 	sb->s_blocksize_bits = blksize_bits(sectorsize);
-	memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);
+	/*
+	 * When temp_fsid is active, fs_devices->fsid is assigned a random UUID
+	 * at mount. This inconsistent UUID causes issues for layered filesystems
+	 * like OverlayFS. Since metadata_uuid may or may not be set, provide the
+	 * on-disk UUID directly from the super_copy.
+	 */
+	if (fs_info->fs_devices->temp_fsid)
+		memcpy(&sb->s_uuid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
+	else
+		memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);
 
 	mutex_lock(&fs_info->chunk_mutex);
 	ret = btrfs_read_sys_array(fs_info);
@@ -3591,6 +3679,13 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 		}
 	}
 
+	ret = btrfs_init_writeback_bio_size(fs_info);
+	if (ret) {
+		btrfs_err(fs_info, "failed to get optimum writeback size: %d",
+			  ret);
+		goto fail_sysfs;
+	}
+
 	btrfs_free_zone_cache(fs_info);
 
 	btrfs_check_active_zone_reservation(fs_info);
@@ -3706,7 +3801,7 @@ fail_tree_roots:
 	if (fs_info->data_reloc_root)
 		btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root);
 	free_root_pointers(fs_info, true);
-	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
+	invalidate_and_check_btree_folios(fs_info);
 
 fail_sb_buffer:
 	btrfs_stop_all_workers(fs_info);
@@ -4209,7 +4304,6 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
 		list_del_init(&trans->list);
 
 		btrfs_put_transaction(trans);
-		trace_btrfs_transaction_commit(fs_info);
 	}
 	ASSERT(!found);
 }
@@ -4280,16 +4374,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
 		btrfs_error_commit_super(fs_info);
 
 	/*
-	 * Wait for any fixup workers to complete.
-	 * If we don't wait for them here and they are still running by the time
-	 * we call kthread_stop() against the cleaner kthread further below, we
-	 * get an use-after-free on the cleaner because the fixup worker adds an
-	 * inode to the list of delayed iputs and then attempts to wakeup the
-	 * cleaner kthread, which was already stopped and destroyed. We parked
-	 * already the cleaner, but below we run all pending delayed iputs.
-	 */
-	btrfs_flush_workqueue(fs_info->fixup_workers);
-	/*
 	 * Similar case here, we have to wait for delalloc workers before we
 	 * proceed below and stop the cleaner kthread, otherwise we trigger a
 	 * use-after-tree on the cleaner kthread task_struct when a delalloc
@@ -4412,7 +4496,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
 	ASSERT(list_empty(&fs_info->delayed_iputs));
 	set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
 
-	if (btrfs_check_quota_leak(fs_info)) {
+	if (unlikely(btrfs_check_quota_leak(fs_info))) {
 		DEBUG_WARN("qgroup reserved space leaked");
 		btrfs_err(fs_info, "qgroup reserved space leaked");
 	}
@@ -4445,7 +4529,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
 	 * We must make sure there is not any read request to
 	 * submit after we stop all workers.
 	 */
-	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
+	invalidate_and_check_btree_folios(fs_info);
 	btrfs_stop_all_workers(fs_info);
 
 	/*
@@ -4881,7 +4965,6 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
 		spin_unlock(&fs_info->trans_lock);
 
 		btrfs_put_transaction(t);
-		trace_btrfs_transaction_commit(fs_info);
 		spin_lock(&fs_info->trans_lock);
 	}
 	spin_unlock(&fs_info->trans_lock);
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index 626702244809..c18ea5ef2974 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -1763,7 +1763,7 @@ u64 btrfs_count_range_bits(struct extent_io_tree *tree,
 	u64 cur_start = *start;
 	u64 total_bytes = 0;
 	u64 last = 0;
-	int found = 0;
+	bool found = false;
 
 	if (WARN_ON(search_end < cur_start))
 		return 0;
@@ -1817,7 +1817,7 @@ search:
 				break;
 			if (!found) {
 				*start = max(cur_start, state->start);
-				found = 1;
+				found = true;
 			}
 			last = state->end;
 		} else if (contig && found) {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 391fad41c3b6..624d76e0ca01 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -385,7 +385,7 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
 					return type;
 			}
 		} else {
-			ASSERT(is_data == BTRFS_REF_TYPE_ANY);
+			ASSERT(is_data == BTRFS_REF_TYPE_ANY, "is_data=%d", is_data);
 			return type;
 		}
 	}
@@ -1699,13 +1699,13 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
 	struct extent_buffer *leaf;
 	u32 item_size;
 	int ret;
-	int metadata = 1;
+	bool metadata = true;
 
 	if (TRANS_ABORTED(trans))
 		return 0;
 
 	if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
-		metadata = 0;
+		metadata = false;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -1745,7 +1745,7 @@ again:
 			}
 			if (ret > 0) {
 				btrfs_release_path(path);
-				metadata = 0;
+				metadata = false;
 
 				key.objectid = head->bytenr;
 				key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -2108,7 +2108,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			locked_ref = btrfs_select_ref_head(fs_info, delayed_refs);
 			if (IS_ERR_OR_NULL(locked_ref)) {
 				if (PTR_ERR(locked_ref) == -EAGAIN) {
-					continue;
+					count++;
+					goto again;
 				} else {
 					break;
 				}
@@ -2156,7 +2157,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 		 * Either success case or btrfs_run_delayed_refs_for_head
 		 * returned -EAGAIN, meaning we need to select another head
 		 */
-
+again:
 		locked_ref = NULL;
 		cond_resched();
 	} while ((min_bytes != U64_MAX && bytes_processed < min_bytes) ||
@@ -2531,8 +2532,11 @@ int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset,
 				struct btrfs_key key;
 
 				btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-				ASSERT(key.objectid == bytenr);
-				ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY);
+				ASSERT(key.objectid == bytenr,
+				       "key.objectid=%llu bytenr=%llu",
+				       key.objectid, bytenr);
+				ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY, "key.type=%u",
+				       key.type);
 			}
 		}
 
@@ -3279,7 +3283,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	int ret;
 	int is_data;
 	int extent_slot = 0;
-	int found_extent = 0;
+	bool found_extent = false;
 	int num_to_del = 1;
 	int refs_to_drop = node->ref_mod;
 	u32 item_size;
@@ -3335,12 +3339,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 				break;
 			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
 			    key.offset == num_bytes) {
-				found_extent = 1;
+				found_extent = true;
 				break;
 			}
 			if (key.type == BTRFS_METADATA_ITEM_KEY &&
 			    key.offset == owner_objectid) {
-				found_extent = 1;
+				found_extent = true;
 				break;
 			}
 
@@ -4598,10 +4602,12 @@ static noinline int find_free_extent(struct btrfs_root *root,
 		/* Use dedicated sub-space_info for dedicated block group users. */
 		if (ffe_ctl->for_data_reloc) {
 			space_info = space_info->sub_group[0];
-			ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
+			ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC,
+			       "space_info->subgroup_id=%d", space_info->subgroup_id);
 		} else if (ffe_ctl->for_treelog) {
 			space_info = space_info->sub_group[0];
-			ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_TREELOG);
+			ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_TREELOG,
+			       "space_info->subgroup_id=%d", space_info->subgroup_id);
 		}
 	}
 	if (!space_info) {
@@ -5781,16 +5787,21 @@ static int check_next_block_uptodate(struct btrfs_trans_handle *trans,
 
 	generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]);
 
-	if (btrfs_buffer_uptodate(next, generation, NULL))
-		return 0;
-
 	check.level = level - 1;
 	check.transid = generation;
 	check.owner_root = btrfs_root_id(root);
 	check.has_first_key = true;
 	btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, path->slots[level]);
 
+	ret = btrfs_buffer_uptodate(next, generation, &check);
+	if (ret > 0)
+		return 0;
 	btrfs_tree_unlock(next);
+	if (ret < 0) {
+		free_extent_buffer(next);
+		return ret;
+	}
+
 	if (level == 1)
 		reada_walk_down(trans, root, wc, path);
 	ret = btrfs_read_extent_buffer(next, &check);
@@ -6613,12 +6624,16 @@ static int btrfs_trim_free_extents_throttle(struct btrfs_device *device,
 
 	*trimmed = 0;
 
-	/* Discard not supported = nothing to do. */
-	if (!bdev_max_discard_sectors(device->bdev))
+	/*
+	 * The caller only filters out MISSING devices, but a device that was
+	 * missing at mount and later rescanned has MISSING cleared while bdev
+	 * is still NULL and WRITEABLE is still unset. Skip those here.
+	 */
+	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) || !device->bdev)
 		return 0;
 
-	/* Not writable = nothing to do. */
-	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
+	/* Discard not supported = nothing to do. */
+	if (!bdev_max_discard_sectors(device->bdev))
 		return 0;
 
 	/* No free space = nothing to do. */
@@ -6645,7 +6660,7 @@ static int btrfs_trim_free_extents_throttle(struct btrfs_device *device,
 		start = max(start, cur_start);
 
 		/* Check if there are any CHUNK_* bits left */
-		if (start > device->total_bytes) {
+		if (unlikely(start > device->total_bytes)) {
 			DEBUG_WARN();
 			btrfs_warn(fs_info,
 "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2275189b7860..7d604524e83c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -130,7 +130,8 @@ struct btrfs_bio_ctrl {
 	 * extent_writepage_io().
 	 * This is to avoid touching ranges covered by compression/inline.
 	 */
-	unsigned long submit_bitmap;
+	unsigned long submit_bitmap[BITS_TO_LONGS(BTRFS_MAX_BLOCKS_PER_FOLIO)];
+
 	struct readahead_control *ractl;
 
 	/*
@@ -250,8 +251,6 @@ static void process_one_folio(struct btrfs_fs_info *fs_info,
 	ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX);
 	len = end + 1 - start;
 
-	if (page_ops & PAGE_SET_ORDERED)
-		btrfs_folio_clamp_set_ordered(fs_info, folio, start, len);
 	if (page_ops & PAGE_START_WRITEBACK) {
 		btrfs_folio_clamp_clear_dirty(fs_info, folio, start, len);
 		btrfs_folio_clamp_set_writeback(fs_info, folio, start, len);
@@ -381,7 +380,7 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
 	bool found;
 	struct extent_state *cached_state = NULL;
 	int ret;
-	int loops = 0;
+	bool loops = false;
 
 	/* Caller should pass a valid @end to indicate the search range end */
 	ASSERT(orig_end > orig_start);
@@ -438,7 +437,7 @@ again:
 		cached_state = NULL;
 		if (!loops) {
 			max_bytes = fs_info->sectorsize;
-			loops = 1;
+			loops = true;
 			goto again;
 		} else {
 			return false;
@@ -530,8 +529,6 @@ static void end_bbio_data_write(struct btrfs_bio *bbio)
 		u32 len = fi.length;
 
 		bio_size += len;
-		ASSERT(btrfs_folio_test_ordered(fs_info, folio, start, len));
-		btrfs_folio_clear_ordered(fs_info, folio, start, len);
 		btrfs_folio_clear_writeback(fs_info, folio, start, len);
 	}
 
@@ -620,24 +617,24 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
 }
 
 /*
- * Populate every free slot in a provided array with folios using GFP_NOFS.
+ * Populate every free slot in a provided array with folios.
  *
- * @nr_folios:   number of folios to allocate
- * @order:	 the order of the folios to be allocated
- * @folio_array: the array to fill with folios; any existing non-NULL entries in
- *		 the array will be skipped
+ * @nr_folios:    number of folios to allocate
+ * @order:	  folio order
+ * @folio_array:  array to fill with folios; non-NULL entries are skipped
+ * @gfp:          GFP flags for the allocation
  *
  * Return: 0        if all folios were able to be allocated;
  *         -ENOMEM  otherwise, the partially allocated folios would be freed and
  *                  the array slots zeroed
  */
 int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order,
-			    struct folio **folio_array)
+			    struct folio **folio_array, gfp_t gfp)
 {
 	for (int i = 0; i < nr_folios; i++) {
 		if (folio_array[i])
 			continue;
-		folio_array[i] = folio_alloc(GFP_NOFS, order);
+		folio_array[i] = folio_alloc(gfp, order);
 		if (!folio_array[i])
 			goto error;
 	}
@@ -652,21 +649,18 @@ error:
 }
 
 /*
- * Populate every free slot in a provided array with pages, using GFP_NOFS.
+ * Populate every free slot in a provided array with pages.
  *
- * @nr_pages:   number of pages to allocate
- * @page_array: the array to fill with pages; any existing non-null entries in
- *		the array will be skipped
- * @nofail:	whether using __GFP_NOFAIL flag
+ * @nr_pages:    number of pages to allocate
+ * @page_array:  array to fill; non-NULL entries are skipped
+ * @gfp:         GFP flags for the allocation
  *
  * Return: 0        if all pages were able to be allocated;
  *         -ENOMEM  otherwise, the partially allocated pages would be freed and
  *                  the array slots zeroed
  */
-int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
-			   bool nofail)
+int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, gfp_t gfp)
 {
-	const gfp_t gfp = nofail ? (GFP_NOFS | __GFP_NOFAIL) : GFP_NOFS;
 	unsigned int allocated;
 
 	for (allocated = 0; allocated < nr_pages;) {
@@ -690,13 +684,13 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
  *
  * For now, the folios populated are always in order 0 (aka, single page).
  */
-static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail)
+static int alloc_eb_folio_array(struct extent_buffer *eb, gfp_t gfp)
 {
 	struct page *page_array[INLINE_EXTENT_BUFFER_PAGES] = { 0 };
 	int num_pages = num_extent_pages(eb);
 	int ret;
 
-	ret = btrfs_alloc_page_array(num_pages, page_array, nofail);
+	ret = btrfs_alloc_page_array(num_pages, page_array, gfp);
 	if (ret < 0)
 		return ret;
 
@@ -729,9 +723,9 @@ static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl,
 		bio_end_sector(bio) == sector;
 }
 
-static void alloc_new_bio(struct btrfs_inode *inode,
-			  struct btrfs_bio_ctrl *bio_ctrl,
-			  u64 disk_bytenr, u64 file_offset)
+static int alloc_new_bio(struct btrfs_inode *inode,
+			 struct btrfs_bio_ctrl *bio_ctrl,
+			 u64 disk_bytenr, u64 file_offset)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	struct btrfs_bio *bbio;
@@ -748,13 +742,25 @@ static void alloc_new_bio(struct btrfs_inode *inode,
 	if (bio_ctrl->wbc) {
 		struct btrfs_ordered_extent *ordered;
 
+		/* This must be a write for data inodes. */
+		ASSERT(btrfs_op(&bio_ctrl->bbio->bio) == BTRFS_MAP_WRITE);
+		ASSERT(is_data_inode(inode));
+
 		ordered = btrfs_lookup_ordered_extent(inode, file_offset);
-		if (ordered) {
-			bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
-					ordered->file_offset +
-					ordered->disk_num_bytes - file_offset);
-			bbio->ordered = ordered;
+		if (unlikely(!ordered)) {
+			bio_ctrl->bbio = NULL;
+			bio_ctrl->next_file_offset = 0;
+			bio_put(&bbio->bio);
+			btrfs_err_rl(fs_info,
+	"root %lld ino %llu file offset %llu is marked dirty without notifying the fs",
+				     btrfs_root_id(inode->root), btrfs_ino(inode),
+				     file_offset);
+			return -EUCLEAN;
 		}
+		bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
+				ordered->file_offset +
+				ordered->disk_num_bytes - file_offset);
+		bbio->ordered = ordered;
 
 		/*
 		 * Pick the last added device to support cgroup writeback.  For
@@ -765,6 +771,7 @@ static void alloc_new_bio(struct btrfs_inode *inode,
 		bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev);
 		wbc_init_bio(bio_ctrl->wbc, &bbio->bio);
 	}
+	return 0;
 }
 
 /*
@@ -780,14 +787,19 @@ static void alloc_new_bio(struct btrfs_inode *inode,
  * new one in @bio_ctrl->bbio.
  * The mirror number for this IO should already be initialized in
  * @bio_ctrl->mirror_num.
+ *
+ * Return the number of bytes that were queued into a bio.
+ * If the returned value is smaller than @size, it means we hit a critical error
+ * for a data write, where there is no ordered extent for the range.
  */
-static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
-			       u64 disk_bytenr, struct folio *folio,
-			       size_t size, unsigned long pg_offset,
-			       u64 read_em_generation)
+static unsigned int submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
+					u64 disk_bytenr, struct folio *folio,
+					size_t size, unsigned long pg_offset,
+					u64 read_em_generation)
 {
 	struct btrfs_inode *inode = folio_to_inode(folio);
 	loff_t file_offset = folio_pos(folio) + pg_offset;
+	unsigned int queued = 0;
 
 	ASSERT(pg_offset + size <= folio_size(folio));
 	ASSERT(bio_ctrl->end_io_func);
@@ -800,8 +812,13 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
 		u32 len = size;
 
 		/* Allocate new bio if needed */
-		if (!bio_ctrl->bbio)
-			alloc_new_bio(inode, bio_ctrl, disk_bytenr, file_offset);
+		if (!bio_ctrl->bbio) {
+			int ret;
+
+			ret = alloc_new_bio(inode, bio_ctrl, disk_bytenr, file_offset);
+			if (ret < 0)
+				break;
+		}
 
 		/* Cap to the current ordered extent boundary if there is one. */
 		if (len > bio_ctrl->len_to_oe_boundary) {
@@ -829,6 +846,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
 		pg_offset += len;
 		disk_bytenr += len;
 		file_offset += len;
+		queued += len;
 
 		/*
 		 * len_to_oe_boundary defaults to U32_MAX, which isn't folio or
@@ -857,7 +875,18 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
 		/* Ordered extent boundary: move on to a new bio. */
 		if (bio_ctrl->len_to_oe_boundary == 0)
 			submit_one_bio(bio_ctrl);
+		/*
+		 * If we have accumulated decent amount of IO, send it to the
+		 * block layer so that IO can run while we are accumulating
+		 * more folios to write.
+		 */
+		else if (bio_ctrl->wbc &&
+			 bio_ctrl->bbio->bio.bi_iter.bi_size >=
+			    inode->root->fs_info->writeback_bio_size)
+			submit_one_bio(bio_ctrl);
+
 	} while (size);
+	return queued;
 }
 
 static int attach_extent_buffer_folio(struct extent_buffer *eb,
@@ -922,6 +951,17 @@ void clear_folio_extent_mapped(struct folio *folio)
 	struct btrfs_fs_info *fs_info;
 
 	ASSERT(folio->mapping);
+	/*
+	 * The folio should not have writeback nor dirty flag set.
+	 *
+	 * If dirty flag is set, the folio can be written back again and we
+	 * expect the private flag set for the folio.
+	 *
+	 * If writeback flag is set, the endio may need to utilize the
+	 * private for btrfs_folio_state.
+	 */
+	ASSERT(!folio_test_dirty(folio));
+	ASSERT(!folio_test_writeback(folio));
 
 	if (!folio_test_private(folio))
 		return;
@@ -1030,6 +1070,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
 		u64 disk_bytenr;
 		u64 block_start;
 		u64 em_gen;
+		unsigned int queued;
 
 		ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
 		if (cur >= last_byte) {
@@ -1143,8 +1184,10 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
 
 		if (force_bio_submit)
 			submit_one_bio(bio_ctrl);
-		submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize,
-				    pg_offset, em_gen);
+		queued = submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize,
+					     pg_offset, em_gen);
+		/* Read submission should not fail. */
+		ASSERT(queued == blocksize);
 	}
 	return 0;
 }
@@ -1422,7 +1465,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
 	const u64 page_start = folio_pos(folio);
 	const u64 page_end = page_start + folio_size(folio) - 1;
 	const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
-	unsigned long delalloc_bitmap = 0;
+	unsigned long delalloc_bitmap[BITS_TO_LONGS(BTRFS_MAX_BLOCKS_PER_FOLIO)] = { 0 };
 	/*
 	 * Save the last found delalloc end. As the delalloc end can go beyond
 	 * page boundary, thus we cannot rely on subpage bitmap to locate the
@@ -1447,14 +1490,9 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
 	int ret = 0;
 
 	/* Save the dirty bitmap as our submission bitmap will be a subset of it. */
-	if (btrfs_is_subpage(fs_info, folio)) {
-		ASSERT(blocks_per_folio > 1);
-		btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap);
-	} else {
-		bio_ctrl->submit_bitmap = 1;
-	}
+	btrfs_copy_subpage_dirty_bitmap(fs_info, folio, bio_ctrl->submit_bitmap);
 
-	for_each_set_bitrange(start_bit, end_bit, &bio_ctrl->submit_bitmap,
+	for_each_set_bitrange(start_bit, end_bit, bio_ctrl->submit_bitmap,
 			      blocks_per_folio) {
 		u64 start = page_start + (start_bit << fs_info->sectorsize_bits);
 		u32 len = (end_bit - start_bit) << fs_info->sectorsize_bits;
@@ -1470,7 +1508,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
 			delalloc_start = delalloc_end + 1;
 			continue;
 		}
-		set_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start,
+		set_delalloc_bitmap(folio, delalloc_bitmap, delalloc_start,
 				    min(delalloc_end, page_end) + 1 - delalloc_start);
 		last_delalloc_end = delalloc_end;
 		delalloc_start = delalloc_end + 1;
@@ -1496,7 +1534,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
 			found_len = last_delalloc_end + 1 - found_start;
 			found = true;
 		} else {
-			found = find_next_delalloc_bitmap(folio, &delalloc_bitmap,
+			found = find_next_delalloc_bitmap(folio, delalloc_bitmap,
 					delalloc_start, &found_start, &found_len);
 		}
 		if (!found)
@@ -1530,7 +1568,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
 					     btrfs_ino(inode),
 					     folio_pos(folio),
 					     blocks_per_folio,
-					     &bio_ctrl->submit_bitmap,
+					     bio_ctrl->submit_bitmap,
 					     found_start, found_len, ret);
 		} else {
 			/*
@@ -1555,7 +1593,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
 						 fs_info->sectorsize_bits;
 			unsigned int end_bit = (min(page_end + 1, found_start + found_len) -
 						page_start) >> fs_info->sectorsize_bits;
-			bitmap_clear(&bio_ctrl->submit_bitmap, start_bit, end_bit - start_bit);
+			bitmap_clear(bio_ctrl->submit_bitmap, start_bit, end_bit - start_bit);
 		}
 		/*
 		 * Above btrfs_run_delalloc_range() may have unlocked the folio,
@@ -1576,12 +1614,11 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
 				fs_info->sectorsize_bits,
 				blocks_per_folio);
 
-		for_each_set_bitrange(start_bit, end_bit, &bio_ctrl->submit_bitmap,
+		for_each_set_bitrange(start_bit, end_bit, bio_ctrl->submit_bitmap,
 				      bitmap_size) {
 			u64 start = page_start + (start_bit << fs_info->sectorsize_bits);
 			u32 len = (end_bit - start_bit) << fs_info->sectorsize_bits;
 
-			btrfs_folio_clear_ordered(fs_info, folio, start, len);
 			btrfs_mark_ordered_io_finished(inode, start, len, false);
 		}
 		return ret;
@@ -1602,7 +1639,7 @@ out:
 	 * If all ranges are submitted asynchronously, we just need to account
 	 * for them here.
 	 */
-	if (bitmap_empty(&bio_ctrl->submit_bitmap, blocks_per_folio)) {
+	if (bitmap_empty(bio_ctrl->submit_bitmap, blocks_per_folio)) {
 		wbc->nr_to_write -= delalloc_to_write;
 		return 1;
 	}
@@ -1637,6 +1674,7 @@ static int submit_one_sector(struct btrfs_inode *inode,
 	u64 extent_offset;
 	u64 em_end;
 	const u32 sectorsize = fs_info->sectorsize;
+	unsigned int queued;
 
 	ASSERT(IS_ALIGNED(filepos, sectorsize));
 
@@ -1658,7 +1696,6 @@ static int submit_one_sector(struct btrfs_inode *inode,
 		 * ordered extent.
 		 */
 		btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
-		btrfs_folio_clear_ordered(fs_info, folio, filepos, sectorsize);
 		btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize);
 		btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize);
 
@@ -1703,8 +1740,14 @@ static int submit_one_sector(struct btrfs_inode *inode,
 	 */
 	ASSERT(folio_test_writeback(folio));
 
-	submit_extent_folio(bio_ctrl, disk_bytenr, folio,
-			    sectorsize, filepos - folio_pos(folio), 0);
+	queued = submit_extent_folio(bio_ctrl, disk_bytenr, folio,
+				     sectorsize, filepos - folio_pos(folio), 0);
+	if (unlikely(queued < sectorsize)) {
+		btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize);
+		btrfs_mark_ordered_io_finished(inode, filepos, fs_info->sectorsize,
+					       false);
+		return -EUCLEAN;
+	}
 	return 0;
 }
 
@@ -1723,7 +1766,6 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
 						  loff_t i_size)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
-	unsigned long range_bitmap = 0;
 	bool submitted_io = false;
 	int found_error = 0;
 	const u64 end = start + len;
@@ -1738,28 +1780,18 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
 	ASSERT(end <= folio_end, "start=%llu len=%u folio_start=%llu folio_size=%zu",
 	       start, len, folio_start, folio_size(folio));
 
-	ret = btrfs_writepage_cow_fixup(folio);
-	if (ret == -EAGAIN) {
-		/* Fixup worker will requeue */
-		folio_redirty_for_writepage(bio_ctrl->wbc, folio);
-		folio_unlock(folio);
-		return 1;
-	}
-	if (ret < 0) {
-		btrfs_folio_clear_dirty(fs_info, folio, start, len);
-		btrfs_folio_set_writeback(fs_info, folio, start, len);
-		btrfs_folio_clear_writeback(fs_info, folio, start, len);
-		return ret;
-	}
-
-	bitmap_set(&range_bitmap, (start - folio_pos(folio)) >> fs_info->sectorsize_bits,
-		   len >> fs_info->sectorsize_bits);
-	bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap,
-		   blocks_per_folio);
+	/* Truncate the submit bitmap to the current range. */
+	if (start > folio_start)
+		bitmap_clear(bio_ctrl->submit_bitmap, 0,
+			     (start - folio_start) >> fs_info->sectorsize_bits);
+	if (start + len < folio_end)
+		bitmap_clear(bio_ctrl->submit_bitmap,
+			     (end - folio_start) >> fs_info->sectorsize_bits,
+			     (folio_end - end) >> fs_info->sectorsize_bits);
 
 	bio_ctrl->end_io_func = end_bbio_data_write;
 
-	for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) {
+	for_each_set_bit(bit, bio_ctrl->submit_bitmap, blocks_per_folio) {
 		cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits);
 
 		if (cur >= i_size) {
@@ -1779,7 +1811,6 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
 			spin_unlock(&inode->ordered_tree_lock);
 			btrfs_put_ordered_extent(ordered);
 
-			btrfs_folio_clear_ordered(fs_info, folio, cur, fs_info->sectorsize);
 			btrfs_mark_ordered_io_finished(inode, cur, fs_info->sectorsize, true);
 			/*
 			 * This range is beyond i_size, thus we don't need to
@@ -1819,6 +1850,25 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
 	return found_error;
 }
 
+static void bio_ctrl_init_submit_bitmap(struct btrfs_fs_info *fs_info,
+					struct folio *folio,
+					struct btrfs_bio_ctrl *bio_ctrl)
+{
+	const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
+
+	ASSERT(blocks_per_folio <= BTRFS_MAX_BLOCKS_PER_FOLIO);
+
+	/*
+	 * Default to unlocking the whole folio.
+	 * The proper bitmap is not initialized until writepage_delalloc().
+	 *
+	 * It is safe to set only the bitmap range [0, blocks_per_folio), as
+	 * every later user of the bitmap follows the same range limit, so
+	 * any bits beyond blocks_per_folio are ignored.
+	 */
+	bitmap_set(bio_ctrl->submit_bitmap, 0, blocks_per_folio);
+}
+
 /*
  * the writepage semantics are similar to regular writepage.  extent
  * records are inserted to lock ranges in the tree, and as dirty areas
@@ -1853,12 +1903,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
 	if (folio_contains(folio, end_index))
 		folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset);
 
-	/*
-	 * Default to unlock the whole folio.
-	 * The proper bitmap can only be initialized until writepage_delalloc().
-	 */
-	bio_ctrl->submit_bitmap = (unsigned long)-1;
-
+	bio_ctrl_init_submit_bitmap(fs_info, folio, bio_ctrl);
 	/*
 	 * If the page is dirty but without private set, it's marked dirty
 	 * without informing the fs.
@@ -1867,13 +1912,9 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
 	 *
 	 * So here we check if the page has private set to rule out such
 	 * case.
-	 * But we also have a long history of relying on the COW fixup,
-	 * so here we only enable this check for experimental builds until
-	 * we're sure it's safe.
 	 */
-	if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) &&
-	    unlikely(!folio_test_private(folio))) {
-		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+	if (unlikely(!folio_test_private(folio))) {
+		DEBUG_WARN();
 		btrfs_err_rl(fs_info,
 	"root %lld ino %llu folio %llu is marked dirty without notifying the fs",
 			     btrfs_root_id(inode->root),
@@ -1901,7 +1942,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
 "failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d",
 			     btrfs_root_id(inode->root), btrfs_ino(inode),
 			     folio_pos(folio), blocks_per_folio,
-			     &bio_ctrl->submit_bitmap, ret);
+			     bio_ctrl->submit_bitmap, ret);
 
 	bio_ctrl->wbc->nr_to_write--;
 
@@ -2318,13 +2359,13 @@ int btree_writepages(struct address_space *mapping, struct writeback_control *wb
 	struct btrfs_eb_write_context ctx = { .wbc = wbc };
 	struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host);
 	int ret = 0;
-	int done = 0;
+	bool done = false;
 	int nr_to_write_done = 0;
 	struct eb_batch batch;
 	unsigned int nr_ebs;
 	unsigned long index;
 	unsigned long end;
-	int scanned = 0;
+	bool scanned = false;
 	xa_mark_t tag;
 
 	eb_batch_init(&batch);
@@ -2341,7 +2382,7 @@ int btree_writepages(struct address_space *mapping, struct writeback_control *wb
 		index = (wbc->range_start >> fs_info->nodesize_bits);
 		end = (wbc->range_end >> fs_info->nodesize_bits);
 
-		scanned = 1;
+		scanned = true;
 	}
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		tag = PAGECACHE_TAG_TOWRITE;
@@ -2364,7 +2405,7 @@ retry:
 					ret = 0;
 
 				if (ret) {
-					done = 1;
+					done = true;
 					break;
 				}
 				continue;
@@ -2390,7 +2431,7 @@ retry:
 		 * We hit the last page and there is more work to be done: wrap
 		 * back to the start of the file
 		 */
-		scanned = 1;
+		scanned = true;
 		index = 0;
 		goto retry;
 	}
@@ -2430,15 +2471,15 @@ static int extent_write_cache_pages(struct address_space *mapping,
 	struct writeback_control *wbc = bio_ctrl->wbc;
 	struct inode *inode = mapping->host;
 	int ret = 0;
-	int done = 0;
+	bool done = false;
 	int nr_to_write_done = 0;
 	struct folio_batch fbatch;
 	unsigned int nr_folios;
 	pgoff_t index;
 	pgoff_t end;		/* Inclusive */
 	pgoff_t done_index;
-	int range_whole = 0;
-	int scanned = 0;
+	bool range_whole = false;
+	bool scanned = false;
 	xa_mark_t tag;
 
 	/*
@@ -2466,8 +2507,8 @@ static int extent_write_cache_pages(struct address_space *mapping,
 		index = wbc->range_start >> PAGE_SHIFT;
 		end = wbc->range_end >> PAGE_SHIFT;
 		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
-			range_whole = 1;
-		scanned = 1;
+			range_whole = true;
+		scanned = true;
 	}
 
 	/*
@@ -2544,14 +2585,14 @@ retry:
 			}
 
 			if (folio_test_writeback(folio) ||
-			    !folio_clear_dirty_for_io(folio)) {
+			    !folio_test_dirty(folio)) {
 				folio_unlock(folio);
 				continue;
 			}
 
 			ret = extent_writepage(folio, bio_ctrl);
 			if (ret < 0) {
-				done = 1;
+				done = true;
 				break;
 			}
 
@@ -2571,7 +2612,7 @@ retry:
 		 * We hit the last page and there is more work to be done: wrap
 		 * back to the start of the file
 		 */
-		scanned = 1;
+		scanned = true;
 		index = 0;
 
 		/*
@@ -2648,7 +2689,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
 		 * Set the submission bitmap to submit all sectors.
 		 * extent_writepage_io() will do the truncation correctly.
 		 */
-		bio_ctrl.submit_bitmap = (unsigned long)-1;
+		bio_ctrl_init_submit_bitmap(fs_info, folio, &bio_ctrl);
 		ret = extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len,
 					  &bio_ctrl, i_size);
 		if (ret == 1)
@@ -2717,38 +2758,6 @@ void btrfs_readahead(struct readahead_control *rac)
 }
 
 /*
- * basic invalidate_folio code, this waits on any locked or writeback
- * ranges corresponding to the folio, and then deletes any extent state
- * records from the tree
- */
-int extent_invalidate_folio(struct extent_io_tree *tree,
-			  struct folio *folio, size_t offset)
-{
-	struct extent_state *cached_state = NULL;
-	u64 start = folio_pos(folio);
-	u64 end = start + folio_size(folio) - 1;
-	size_t blocksize = folio_to_fs_info(folio)->sectorsize;
-
-	/* This function is only called for the btree inode */
-	ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
-
-	start += ALIGN(offset, blocksize);
-	if (start > end)
-		return 0;
-
-	btrfs_lock_extent(tree, start, end, &cached_state);
-	folio_wait_writeback(folio);
-
-	/*
-	 * Currently for btree io tree, only EXTENT_LOCKED is utilized,
-	 * so here we only need to unlock the extent range to free any
-	 * existing extent state.
-	 */
-	btrfs_unlock_extent(tree, start, end, &cached_state);
-	return 0;
-}
-
-/*
  * A helper for struct address_space_operations::release_folio, this tests for
  * areas of the folio that are locked or under IO and drops the related state
  * bits if it is safe to drop the folio.
@@ -2877,12 +2886,6 @@ next:
 	return try_release_extent_state(io_tree, folio);
 }
 
-static int extent_buffer_under_io(const struct extent_buffer *eb)
-{
-	return (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
-		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
-}
-
 static bool folio_range_has_eb(struct folio *folio)
 {
 	struct btrfs_folio_state *bfs;
@@ -3097,7 +3100,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
 	 */
 	set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
 
-	ret = alloc_eb_folio_array(new, false);
+	ret = alloc_eb_folio_array(new, GFP_NOFS);
 	if (ret)
 		goto release_eb;
 
@@ -3138,7 +3141,7 @@ struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
 	if (!eb)
 		return NULL;
 
-	ret = alloc_eb_folio_array(eb, false);
+	ret = alloc_eb_folio_array(eb, GFP_NOFS);
 	if (ret)
 		goto release_eb;
 
@@ -3394,8 +3397,8 @@ retry:
 finish:
 	spin_lock(&mapping->i_private_lock);
 	if (existing_folio && btrfs_meta_is_subpage(fs_info)) {
-		/* We're going to reuse the existing page, can drop our folio now. */
-		__free_page(folio_page(eb->folios[i], 0));
+		/* We're going to reuse the existing folio, can drop our folio now. */
+		folio_put(eb->folios[i]);
 		eb->folios[i] = existing_folio;
 	} else if (existing_folio) {
 		struct extent_buffer *existing_eb;
@@ -3410,7 +3413,7 @@ finish:
 			return 1;
 		}
 		/* The extent buffer no longer exists, we can reuse the folio. */
-		__free_page(folio_page(eb->folios[i], 0));
+		folio_put(eb->folios[i]);
 		eb->folios[i] = existing_folio;
 	}
 	eb->folio_size = folio_size(eb->folios[i]);
@@ -3441,7 +3444,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 	struct btrfs_folio_state *prealloc = NULL;
 	u64 lockdep_owner = owner_root;
 	bool page_contig = true;
-	int uptodate = 1;
+	bool uptodate = true;
 	int ret;
 
 	if (check_eb_alignment(fs_info, start))
@@ -3491,8 +3494,12 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 	}
 
 reallocate:
-	/* Allocate all pages first. */
-	ret = alloc_eb_folio_array(eb, true);
+	/*
+	 * Allocate all pages first. These will be attached to btree_inode->i_mapping
+	 * below (added to LRU, served by btree_migrate_folio), so request
+	 * __GFP_MOVABLE so the page allocator places them in MOVABLE pageblocks.
+	 */
+	ret = alloc_eb_folio_array(eb, GFP_NOFS | __GFP_NOFAIL | __GFP_MOVABLE);
 	if (ret < 0) {
 		btrfs_free_folio_state(prealloc);
 		goto out;
@@ -3551,7 +3558,7 @@ reallocate:
 			page_contig = false;
 
 		if (!btrfs_meta_folio_test_uptodate(folio, eb))
-			uptodate = 0;
+			uptodate = false;
 
 		/*
 		 * We can't unlock the pages just yet since the extent buffer
@@ -3739,17 +3746,6 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
 	release_extent_buffer(eb);
 }
 
-static void btree_clear_folio_dirty_tag(struct folio *folio)
-{
-	ASSERT(!folio_test_dirty(folio));
-	ASSERT(folio_test_locked(folio));
-	xa_lock_irq(&folio->mapping->i_pages);
-	if (!folio_test_dirty(folio))
-		__xa_clear_mark(&folio->mapping->i_pages, folio->index,
-				PAGECACHE_TAG_DIRTY);
-	xa_unlock_irq(&folio->mapping->i_pages);
-}
-
 void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
 			      struct extent_buffer *eb)
 {
@@ -3790,7 +3786,7 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
 		folio_lock(folio);
 		last = btrfs_meta_folio_clear_and_test_dirty(folio, eb);
 		if (last)
-			btree_clear_folio_dirty_tag(folio);
+			btrfs_clear_folio_dirty_tag(folio);
 		folio_unlock(folio);
 	}
 	WARN_ON(refcount_read(&eb->refs) == 0);
@@ -3981,15 +3977,14 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num,
 	return 0;
 }
 
-static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
-			    unsigned long len)
+/* Never inlined to decrease code size, as this is called in a cold path. */
+static noinline void report_eb_range(const struct extent_buffer *eb,
+				     unsigned long start, unsigned long len)
 {
 	btrfs_warn(eb->fs_info,
 		"access to eb bytenr %llu len %u out of range start %lu len %lu",
 		eb->start, eb->len, start, len);
 	DEBUG_WARN();
-
-	return true;
 }
 
 /*
@@ -3999,14 +3994,16 @@ static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
  *
  * Caller should not touch the dst/src memory if this function returns error.
  */
-static inline int check_eb_range(const struct extent_buffer *eb,
-				 unsigned long start, unsigned long len)
+static inline bool check_eb_range(const struct extent_buffer *eb,
+				  unsigned long start, unsigned long len)
 {
 	unsigned long offset;
 
 	/* start, start + len should not go beyond eb->len nor overflow */
-	if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
-		return report_eb_range(eb, start, len);
+	if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len)) {
+		report_eb_range(eb, start, len);
+		return true;
+	}
 
 	return false;
 }
@@ -4660,7 +4657,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
 	if (IS_ERR(eb))
 		return;
 
-	if (btrfs_buffer_uptodate(eb, gen, NULL)) {
+	if (btrfs_buffer_uptodate(eb, gen, &check)) {
 		free_extent_buffer(eb);
 		return;
 	}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b310a5145cf6..9896e15ddc40 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -55,7 +55,6 @@ enum {
 	/* Page starts writeback, clear dirty bit and set writeback bit */
 	ENUM_BIT(PAGE_START_WRITEBACK),
 	ENUM_BIT(PAGE_END_WRITEBACK),
-	ENUM_BIT(PAGE_SET_ORDERED),
 };
 
 /*
@@ -327,6 +326,12 @@ static inline bool extent_buffer_uptodate(const struct extent_buffer *eb)
 	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 }
 
+static inline bool extent_buffer_under_io(const struct extent_buffer *eb)
+{
+	return (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
+		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+}
+
 int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
 			 unsigned long start, unsigned long len);
 void read_extent_buffer(const struct extent_buffer *eb, void *dst,
@@ -381,15 +386,23 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
 				  const struct folio *locked_folio,
 				  struct extent_state **cached,
 				  u32 bits_to_clear, unsigned long page_ops);
-int extent_invalidate_folio(struct extent_io_tree *tree,
-			    struct folio *folio, size_t offset);
 void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
 			      struct extent_buffer *buf);
 
-int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
-			   bool nofail);
+static inline void btrfs_clear_folio_dirty_tag(struct folio *folio)
+{
+	ASSERT(!folio_test_dirty(folio));
+	ASSERT(folio_test_locked(folio));
+	ASSERT(folio->mapping);
+	xa_lock_irq(&folio->mapping->i_pages);
+	__xa_clear_mark(&folio->mapping->i_pages, folio->index,
+			PAGECACHE_TAG_DIRTY);
+	xa_unlock_irq(&folio->mapping->i_pages);
+}
+
+int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, gfp_t gfp);
 int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order,
-			    struct folio **folio_array);
+			    struct folio **folio_array, gfp_t gfp);
 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 bool find_lock_delalloc_range(struct inode *inode,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 6b79bff241f2..fce9c5cc0122 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -717,7 +717,7 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode,
 	 * file offset. Here just do a sanity check.
 	 */
 	if (em->disk_bytenr == EXTENT_MAP_INLINE)
-		ASSERT(em->start == 0);
+		ASSERT(em->start == 0, "em->start=%llu", em->start);
 
 	ret = add_extent_mapping(inode, em, false);
 	/* it is possible that someone inserted the extent into the tree
@@ -761,7 +761,7 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode,
 		}
 	}
 
-	ASSERT(ret == 0 || ret == -EEXIST);
+	ASSERT(ret == 0 || ret == -EEXIST, "ret=%d", ret);
 	return ret;
 }
 
@@ -943,7 +943,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
 
 				ret = add_extent_mapping(inode, split, modified);
 				/* Logic error, shouldn't happen. */
-				ASSERT(ret == 0);
+				ASSERT(ret == 0, "ret=%d", ret);
 				if (WARN_ON(ret != 0) && modified)
 					btrfs_set_inode_full_sync(inode);
 			}
diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c
index 27d361c7adc4..6263e837093e 100644
--- a/fs/btrfs/fiemap.c
+++ b/fs/btrfs/fiemap.c
@@ -112,7 +112,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
 	u64 cache_end;
 
 	/* Set at the end of extent_fiemap(). */
-	ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);
+	ASSERT((flags & FIEMAP_EXTENT_LAST) == 0, "flags=0x%x", flags);
 
 	if (!cache->cached)
 		goto assign;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index d72249390030..9f6454e9db81 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -325,7 +325,9 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info,
 
 	csum_start = key.offset;
 	csum_len = (itemsize / csum_size) * sectorsize;
-	ASSERT(in_range(disk_bytenr, csum_start, csum_len));
+	ASSERT(in_range(disk_bytenr, csum_start, csum_len),
+	       "disk_bytenr=%llu csum_start=%llu csum_len=%llu",
+	       disk_bytenr, csum_start, csum_len);
 
 found:
 	ret = (min(csum_start + csum_len, disk_bytenr + len) -
@@ -1307,7 +1309,7 @@ found:
 
 	index += ins_size;
 	ins_size /= csum_size;
-	total_bytes += ins_size * fs_info->sectorsize;
+	total_bytes += (ins_size << fs_info->sectorsize_bits);
 
 	if (total_bytes < sums->len) {
 		btrfs_release_path(path);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8c171ed07008..a2a2df2df786 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -49,14 +49,6 @@ static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
 	u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
 
 	ASSERT(block_len <= U32_MAX);
-	/*
-	 * Folio checked is some magic around finding folios that have been
-	 * modified without going through btrfs_dirty_folio().  Clear it here.
-	 * There should be no need to mark the pages accessed as
-	 * prepare_one_folio() should have marked them accessed in
-	 * prepare_one_folio() via find_or_create_page()
-	 */
-	btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len);
 	folio_unlock(folio);
 	folio_put(folio);
 }
@@ -65,7 +57,7 @@ static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
  * After copy_folio_from_iter_atomic(), update the following things for delalloc:
  * - Mark newly dirtied folio as DELALLOC in the io tree.
  *   Used to advise which range is to be written back.
- * - Mark modified folio as Uptodate/Dirty and not needing COW fixup
+ * - Mark modified folio as Uptodate/Dirty
  * - Update inode size for past EOF write
  */
 int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos,
@@ -93,21 +85,12 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos
 
 	end_of_last_block = start_pos + num_bytes - 1;
 
-	/*
-	 * The pages may have already been dirty, clear out old accounting so
-	 * we can set things up properly
-	 */
-	btrfs_clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
-			       EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
-			       cached);
-
-	ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
-					extra_bits, cached);
+	ret = btrfs_reset_extent_delalloc(inode, start_pos, end_of_last_block,
+					  extra_bits, cached);
 	if (ret)
 		return ret;
 
 	btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes);
-	btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes);
 	btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes);
 
 	/*
@@ -158,7 +141,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	int ret;
 	int modify_tree = -1;
 	int update_refs;
-	int found = 0;
+	bool found = false;
 	struct btrfs_path *path = args->path;
 
 	args->bytes_found = 0;
@@ -266,7 +249,7 @@ next_slot:
 			goto next_slot;
 		}
 
-		found = 1;
+		found = true;
 		search_start = max(key.offset, args->start);
 		if (recow || !modify_tree) {
 			modify_tree = -1;
@@ -1232,8 +1215,11 @@ static int copy_one_range(struct btrfs_inode *inode, struct iov_iter *iter,
 		return ret;
 	reserved_len = ret;
 	/* Write range must be inside the reserved range. */
-	ASSERT(reserved_start <= start);
-	ASSERT(start + write_bytes <= reserved_start + reserved_len);
+	ASSERT(reserved_start <= start, "reserved_start=%llu start=%llu",
+	       reserved_start, start);
+	ASSERT(start + write_bytes <= reserved_start + reserved_len,
+	       "start=%llu write_bytes=%zu reserved_start=%llu reserved_len=%llu",
+	       start, write_bytes, reserved_start, reserved_len);
 
 again:
 	ret = balance_dirty_pages_ratelimited_flags(inode->vfs_inode.i_mapping,
@@ -1578,7 +1564,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		btrfs_assert_inode_locked(inode);
 	}
 
-	trace_btrfs_sync_file(file, datasync);
+	trace_btrfs_sync_file_enter(file, datasync);
 
 	btrfs_init_log_ctx(&ctx, inode);
 
@@ -1702,14 +1688,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		 * reason, it's no longer relevant.
 		 */
 		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
-		/*
-		 * An ordered extent might have started before and completed
-		 * already with io errors, in which case the inode was not
-		 * updated and we end up here. So check the inode's mapping
-		 * for any errors that might have happened since we last
-		 * checked called fsync.
-		 */
-		ret = filemap_check_wb_err(inode->vfs_inode.i_mapping, file->f_wb_err);
 		goto out_release_extents;
 	}
 
@@ -1824,10 +1802,17 @@ out:
 	free_extent_buffer(ctx.scratch_eb);
 	ASSERT(list_empty(&ctx.list));
 	ASSERT(list_empty(&ctx.conflict_inodes));
+	ASSERT(ret <= 0, "ret=%d", ret);
+	/*
+	 * Ordered extents might have started and completed before this fsync,
+	 * so check for any io errors and advance the writeback error sequence.
+	 */
 	err = file_check_and_advance_wb_err(file);
 	if (!ret)
 		ret = err;
-	return ret > 0 ? -EIO : ret;
+	trace_btrfs_sync_file_exit(file, ret);
+
+	return ret;
 
 out_release_extents:
 	btrfs_release_log_ctx_extents(&ctx);
@@ -1966,18 +1951,7 @@ again:
 		}
 	}
 
-	/*
-	 * page_mkwrite gets called when the page is firstly dirtied after it's
-	 * faulted in, but write(2) could also dirty a page and set delalloc
-	 * bits, thus in this case for space account reason, we still need to
-	 * clear any delalloc bits within this page range since we have to
-	 * reserve data&meta space before lock_page() (see above comments).
-	 */
-	btrfs_clear_extent_bit(io_tree, page_start, end,
-			       EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
-			       EXTENT_DEFRAG, &cached_state);
-
-	ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state);
+	ret = btrfs_reset_extent_delalloc(inode, page_start, end, 0, &cached_state);
 	if (ret < 0) {
 		btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
 		goto out_unlock;
@@ -1992,7 +1966,6 @@ again:
 	if (zero_start != fsize)
 		folio_zero_range(folio, zero_start, folio_size(folio) - zero_start);
 
-	btrfs_folio_clear_checked(fs_info, folio, page_start, fsize);
 	btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
 	btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
 
@@ -2098,6 +2071,10 @@ static int fill_holes(struct btrfs_trans_handle *trans,
 	struct btrfs_file_extent_item *fi;
 	struct extent_map *hole_em;
 	struct btrfs_key key;
+	int modify_slot = -1;
+	int del_slot = -1;
+	bool update_offset = false;
+	u64 num_bytes = 0;
 	int ret;
 
 	if (btrfs_fs_incompat(fs_info, NO_HOLES))
@@ -2107,7 +2084,7 @@ static int fill_holes(struct btrfs_trans_handle *trans,
 	key.type = BTRFS_EXTENT_DATA_KEY;
 	key.offset = offset;
 
-	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret <= 0) {
 		/*
 		 * We should have dropped this offset, so if we find it then
@@ -2120,33 +2097,44 @@ static int fill_holes(struct btrfs_trans_handle *trans,
 
 	leaf = path->nodes[0];
 	if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
-		u64 num_bytes;
-
-		path->slots[0]--;
-		fi = btrfs_item_ptr(leaf, path->slots[0],
+		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
 				    struct btrfs_file_extent_item);
 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
 			end - offset;
-		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
-		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
-		btrfs_set_file_extent_offset(leaf, fi, 0);
-		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
-		goto out;
+		modify_slot = path->slots[0] - 1;
 	}
-
 	if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
-		u64 num_bytes;
-
-		key.offset = offset;
-		btrfs_set_item_key_safe(trans, path, &key);
 		fi = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_file_extent_item);
-		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
-			offset;
+		if (modify_slot != -1) {
+			num_bytes += btrfs_file_extent_num_bytes(leaf, fi);
+			del_slot = path->slots[0];
+		} else {
+			num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
+				end - offset;
+			modify_slot = path->slots[0];
+			update_offset = true;
+		}
+	}
+	if (modify_slot >= 0) {
+		fi = btrfs_item_ptr(leaf, modify_slot,
+				    struct btrfs_file_extent_item);
 		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
 		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+		if (update_offset) {
+			key.offset = offset;
+			btrfs_set_item_key_safe(trans, path, &key);
+		}
 		btrfs_set_file_extent_offset(leaf, fi, 0);
 		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+		if (del_slot >= 0) {
+			ret = btrfs_del_items(trans, root, path, del_slot, 1);
+			if (ret) {
+				btrfs_abort_transaction(trans, ret);
+				btrfs_release_path(path);
+				return ret;
+			}
+		}
 		goto out;
 	}
 	btrfs_release_path(path);
@@ -2407,7 +2395,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
 	struct btrfs_drop_extents_args drop_args = { 0 };
 	struct btrfs_root *root = inode->root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
-	u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
+	const u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
 	u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
 	struct btrfs_trans_handle *trans = NULL;
 	struct btrfs_block_rsv rsv;
@@ -2420,7 +2408,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
 		return -EINVAL;
 
 	btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP);
-	rsv.size = btrfs_calc_insert_metadata_size(fs_info, 1);
+	rsv.size = min_size;
 	rsv.failfast = true;
 
 	/*
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index ab22e4f9ffdd..6009b1477232 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -433,10 +433,6 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
 
 	for (i = 0; i < io_ctl->num_pages; i++) {
 		if (io_ctl->pages[i]) {
-			btrfs_folio_clear_checked(io_ctl->fs_info,
-					page_folio(io_ctl->pages[i]),
-					page_offset(io_ctl->pages[i]),
-					PAGE_SIZE);
 			unlock_page(io_ctl->pages[i]);
 			put_page(io_ctl->pages[i]);
 		}
@@ -690,11 +686,12 @@ static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
 static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
 {
 	struct btrfs_block_group *block_group = ctl->block_group;
+	const int unit = block_group->fs_info->sectorsize;
 	u64 max_bytes;
 	u64 bitmap_bytes;
 	u64 extent_bytes;
 	u64 size = block_group->length;
-	u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
+	u64 bytes_per_bg = BITS_PER_BITMAP * unit;
 	u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
 
 	max_bitmaps = max_t(u64, max_bitmaps, 1);
@@ -703,7 +700,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
 		btrfs_err(block_group->fs_info,
 "invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu",
 			  block_group->start, block_group->length,
-			  ctl->total_bitmaps, ctl->unit, max_bitmaps,
+			  ctl->total_bitmaps, unit, max_bitmaps,
 			  bytes_per_bg);
 	ASSERT(ctl->total_bitmaps <= max_bitmaps);
 
@@ -718,7 +715,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
 	else
 		max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
 
-	bitmap_bytes = ctl->total_bitmaps * ctl->unit;
+	bitmap_bytes = ctl->total_bitmaps * unit;
 
 	/*
 	 * we want the extent entry threshold to always be at most 1/2 the max
@@ -896,8 +893,7 @@ free_cache:
 	goto out;
 }
 
-static int copy_free_space_cache(struct btrfs_block_group *block_group,
-				 struct btrfs_free_space_ctl *ctl)
+static int copy_free_space_cache(struct btrfs_free_space_ctl *ctl)
 {
 	struct btrfs_free_space *info;
 	struct rb_node *n;
@@ -912,17 +908,17 @@ static int copy_free_space_cache(struct btrfs_block_group *block_group,
 			unlink_free_space(ctl, info, true);
 			spin_unlock(&ctl->tree_lock);
 			kmem_cache_free(btrfs_free_space_cachep, info);
-			ret = btrfs_add_free_space(block_group, offset, bytes);
+			ret = btrfs_add_free_space(ctl->block_group, offset, bytes);
 			spin_lock(&ctl->tree_lock);
 		} else {
 			u64 offset = info->offset;
-			u64 bytes = ctl->unit;
+			u64 bytes = ctl->block_group->fs_info->sectorsize;
 
 			ret = search_bitmap(ctl, info, &offset, &bytes, false);
 			if (ret == 0) {
 				bitmap_clear_bits(ctl, info, offset, bytes, true);
 				spin_unlock(&ctl->tree_lock);
-				ret = btrfs_add_free_space(block_group, offset,
+				ret = btrfs_add_free_space(ctl->block_group, offset,
 							   bytes);
 				spin_lock(&ctl->tree_lock);
 			} else {
@@ -1025,7 +1021,7 @@ int load_free_space_cache(struct btrfs_block_group *block_group)
 
 	if (matched) {
 		spin_lock(&tmp_ctl.tree_lock);
-		ret = copy_free_space_cache(block_group, &tmp_ctl);
+		ret = copy_free_space_cache(&tmp_ctl);
 		spin_unlock(&tmp_ctl.tree_lock);
 		/*
 		 * ret == 1 means we successfully loaded the free space cache,
@@ -1068,12 +1064,12 @@ out:
 
 static noinline_for_stack
 int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
-			      struct btrfs_free_space_ctl *ctl,
 			      struct btrfs_block_group *block_group,
 			      int *entries, int *bitmaps,
 			      struct list_head *bitmap_list)
 {
 	int ret;
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	struct btrfs_free_cluster *cluster = NULL;
 	struct btrfs_free_cluster *cluster_locked = NULL;
 	struct rb_node *node = rb_first(&ctl->free_space_offset);
@@ -1367,17 +1363,17 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
  * or an errno if it was not.
  */
 static int __btrfs_write_out_cache(struct inode *inode,
-				   struct btrfs_free_space_ctl *ctl,
 				   struct btrfs_block_group *block_group,
 				   struct btrfs_trans_handle *trans)
 {
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	struct btrfs_io_ctl *io_ctl = &block_group->io_ctl;
 	struct extent_state *cached_state = NULL;
 	LIST_HEAD(bitmap_list);
 	int entries = 0;
 	int bitmaps = 0;
 	int ret;
-	int must_iput = 0;
+	bool must_iput = false;
 	int i_size;
 
 	if (!i_size_read(inode))
@@ -1397,7 +1393,7 @@ static int __btrfs_write_out_cache(struct inode *inode,
 			up_write(&block_group->data_rwsem);
 			BTRFS_I(inode)->generation = 0;
 			ret = 0;
-			must_iput = 1;
+			must_iput = true;
 			goto out;
 		}
 		spin_unlock(&block_group->lock);
@@ -1416,8 +1412,7 @@ static int __btrfs_write_out_cache(struct inode *inode,
 	mutex_lock(&ctl->cache_writeout_mutex);
 	/* Write out the extent entries in the free space cache */
 	spin_lock(&ctl->tree_lock);
-	ret = write_cache_extent_entries(io_ctl, ctl,
-					 block_group, &entries, &bitmaps,
+	ret = write_cache_extent_entries(io_ctl, block_group, &entries, &bitmaps,
 					 &bitmap_list);
 	if (ret)
 		goto out_nospc_locked;
@@ -1516,7 +1511,6 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
 			  struct btrfs_path *path)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	struct inode *inode;
 	int ret = 0;
 
@@ -1531,7 +1525,7 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
 	if (IS_ERR(inode))
 		return 0;
 
-	ret = __btrfs_write_out_cache(inode, ctl, block_group, trans);
+	ret = __btrfs_write_out_cache(inode, block_group, trans);
 	if (ret) {
 		btrfs_debug(fs_info,
 	  "failed to write free space cache for block group %llu error %d",
@@ -1571,11 +1565,11 @@ static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
 	u64 bitmap_start;
 	u64 bytes_per_bitmap;
 
-	bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
-	bitmap_start = offset - ctl->start;
+	bytes_per_bitmap = BITS_PER_BITMAP * ctl->block_group->fs_info->sectorsize;
+	bitmap_start = offset - ctl->block_group->start;
 	bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
 	bitmap_start *= bytes_per_bitmap;
-	bitmap_start += ctl->start;
+	bitmap_start += ctl->block_group->start;
 
 	return bitmap_start;
 }
@@ -1702,6 +1696,7 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
 {
 	struct rb_node *n = ctl->free_space_offset.rb_node;
 	struct btrfs_free_space *entry = NULL, *prev = NULL;
+	const int unit = ctl->block_group->fs_info->sectorsize;
 
 	lockdep_assert_held(&ctl->tree_lock);
 
@@ -1785,7 +1780,7 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
 			    prev->offset + prev->bytes > offset)
 				return prev;
 		}
-		if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
+		if (entry->offset + BITS_PER_BITMAP * unit > offset)
 			return entry;
 	} else if (entry->offset + entry->bytes > offset)
 		return entry;
@@ -1799,8 +1794,7 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
 			return NULL;
 		entry = rb_entry(n, struct btrfs_free_space, offset_index);
 		if (entry->bitmap) {
-			if (entry->offset + BITS_PER_BITMAP *
-			    ctl->unit > offset)
+			if (entry->offset + BITS_PER_BITMAP * unit > offset)
 				break;
 		} else {
 			if (entry->offset + entry->bytes > offset)
@@ -1875,18 +1869,19 @@ static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
 				     struct btrfs_free_space *info,
 				     u64 offset, u64 bytes, bool update_stat)
 {
+	const int unit = ctl->block_group->fs_info->sectorsize;
 	unsigned long start, count, end;
 	int extent_delta = -1;
 
-	start = offset_to_bit(info->offset, ctl->unit, offset);
-	count = bytes_to_bits(bytes, ctl->unit);
+	start = offset_to_bit(info->offset, unit, offset);
+	count = bytes_to_bits(bytes, unit);
 	end = start + count;
 	ASSERT(end <= BITS_PER_BITMAP);
 
 	bitmap_clear(info->bitmap, start, count);
 
 	info->bytes -= bytes;
-	if (info->max_extent_size > ctl->unit)
+	if (info->max_extent_size > unit)
 		info->max_extent_size = 0;
 
 	relink_bitmap_entry(ctl, info);
@@ -1911,11 +1906,12 @@ static void btrfs_bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
 				  struct btrfs_free_space *info, u64 offset,
 				  u64 bytes)
 {
+	const int unit = ctl->block_group->fs_info->sectorsize;
 	unsigned long start, count, end;
 	int extent_delta = 1;
 
-	start = offset_to_bit(info->offset, ctl->unit, offset);
-	count = bytes_to_bits(bytes, ctl->unit);
+	start = offset_to_bit(info->offset, unit, offset);
+	count = bytes_to_bits(bytes, unit);
 	end = start + count;
 	ASSERT(end <= BITS_PER_BITMAP);
 
@@ -1952,6 +1948,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
 			 struct btrfs_free_space *bitmap_info, u64 *offset,
 			 u64 *bytes, bool for_alloc)
 {
+	const int unit = ctl->block_group->fs_info->sectorsize;
 	unsigned long found_bits = 0;
 	unsigned long max_bits = 0;
 	unsigned long bits, i;
@@ -1969,9 +1966,9 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
 		return -1;
 	}
 
-	i = offset_to_bit(bitmap_info->offset, ctl->unit,
+	i = offset_to_bit(bitmap_info->offset, unit,
 			  max_t(u64, *offset, bitmap_info->offset));
-	bits = bytes_to_bits(*bytes, ctl->unit);
+	bits = bytes_to_bits(*bytes, unit);
 
 	for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
 		if (for_alloc && bits == 1) {
@@ -1991,12 +1988,12 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
 	}
 
 	if (found_bits) {
-		*offset = (u64)(i * ctl->unit) + bitmap_info->offset;
-		*bytes = (u64)(found_bits) * ctl->unit;
+		*offset = (u64)(i * unit) + bitmap_info->offset;
+		*bytes = (u64)(found_bits) * unit;
 		return 0;
 	}
 
-	*bytes = (u64)(max_bits) * ctl->unit;
+	*bytes = (u64)(max_bits) * unit;
 	bitmap_info->max_extent_size = *bytes;
 	relink_bitmap_entry(ctl, bitmap_info);
 	return -1;
@@ -2054,9 +2051,9 @@ again:
 		 * to match our requested alignment
 		 */
 		if (*bytes >= align) {
-			tmp = entry->offset - ctl->start + align - 1;
+			tmp = entry->offset - ctl->block_group->start + align - 1;
 			tmp = div64_u64(tmp, align);
-			tmp = tmp * align + ctl->start;
+			tmp = tmp * align + ctl->block_group->start;
 			align_off = tmp - entry->offset;
 		} else {
 			align_off = 0;
@@ -2148,12 +2145,13 @@ static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
 			      struct btrfs_free_space *bitmap_info,
 			      u64 *offset, u64 *bytes)
 {
+	const int unit = ctl->block_group->fs_info->sectorsize;
 	u64 end;
 	u64 search_start, search_bytes;
 	int ret;
 
 again:
-	end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
+	end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * unit) - 1;
 
 	/*
 	 * We need to search for bits in this bitmap.  We could only cover some
@@ -2162,7 +2160,7 @@ again:
 	 * go searching for the next bit.
 	 */
 	search_start = *offset;
-	search_bytes = ctl->unit;
+	search_bytes = unit;
 	search_bytes = min(search_bytes, end - search_start + 1);
 	ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
 			    false);
@@ -2208,7 +2206,7 @@ again:
 		 * everything over again.
 		 */
 		search_start = *offset;
-		search_bytes = ctl->unit;
+		search_bytes = unit;
 		ret = search_bitmap(ctl, bitmap_info, &search_start,
 				    &search_bytes, false);
 		if (ret < 0 || search_start != *offset)
@@ -2225,6 +2223,7 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
 			       struct btrfs_free_space *info, u64 offset,
 			       u64 bytes, enum btrfs_trim_state trim_state)
 {
+	const int unit = ctl->block_group->fs_info->sectorsize;
 	u64 bytes_to_set = 0;
 	u64 end;
 
@@ -2241,7 +2240,7 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
 		info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
 	}
 
-	end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
+	end = info->offset + (u64)(BITS_PER_BITMAP * unit);
 
 	bytes_to_set = min(end - offset, bytes);
 
@@ -2251,7 +2250,8 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
 
 }
 
-static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
+EXPORT_FOR_TESTS
+bool btrfs_use_bitmap(struct btrfs_free_space_ctl *ctl,
 		      struct btrfs_free_space *info)
 {
 	struct btrfs_block_group *block_group = ctl->block_group;
@@ -2295,22 +2295,18 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
 	 * so allow those block groups to still be allowed to have a bitmap
 	 * entry.
 	 */
-	if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
+	if (((BITS_PER_BITMAP * fs_info->sectorsize) >> 1) > block_group->length)
 		return false;
 
 	return true;
 }
 
-static const struct btrfs_free_space_op free_space_op = {
-	.use_bitmap		= use_bitmap,
-};
-
 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
 			      struct btrfs_free_space *info)
 {
 	struct btrfs_free_space *bitmap_info;
-	struct btrfs_block_group *block_group = NULL;
-	int added = 0;
+	struct btrfs_block_group *block_group = ctl->block_group;
+	bool added = false;
 	u64 bytes, offset, bytes_added;
 	enum btrfs_trim_state trim_state;
 	int ret;
@@ -2319,18 +2315,20 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
 	offset = info->offset;
 	trim_state = info->trim_state;
 
-	if (!ctl->op->use_bitmap(ctl, info))
-		return 0;
-
-	if (ctl->op == &free_space_op)
-		block_group = ctl->block_group;
+	if (btrfs_is_testing(block_group->fs_info)) {
+		if (!block_group->fs_info->use_bitmap(ctl, info))
+			return 0;
+	} else {
+		if (!btrfs_use_bitmap(ctl, info))
+			return 0;
+	}
 again:
 	/*
 	 * Since we link bitmaps right into the cluster we need to see if we
 	 * have a cluster here, and if so and it has our bitmap we need to add
 	 * the free space to that bitmap.
 	 */
-	if (block_group && !list_empty(&block_group->cluster_list)) {
+	if (!list_empty(&block_group->cluster_list)) {
 		struct btrfs_free_cluster *cluster;
 		struct rb_node *node;
 		struct btrfs_free_space *entry;
@@ -2367,7 +2365,7 @@ no_cluster_bitmap:
 	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
 					 1, 0);
 	if (!bitmap_info) {
-		ASSERT(added == 0);
+		ASSERT(!added);
 		goto new_bitmap;
 	}
 
@@ -2375,7 +2373,7 @@ no_cluster_bitmap:
 					  trim_state);
 	bytes -= bytes_added;
 	offset += bytes_added;
-	added = 0;
+	added = false;
 
 	if (!bytes) {
 		ret = 1;
@@ -2386,7 +2384,7 @@ no_cluster_bitmap:
 new_bitmap:
 	if (info && info->bitmap) {
 		add_new_bitmap(ctl, info, offset);
-		added = 1;
+		added = true;
 		info = NULL;
 		goto again;
 	} else {
@@ -2494,6 +2492,7 @@ static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
 				     struct btrfs_free_space *info,
 				     bool update_stat)
 {
+	const int unit = ctl->block_group->fs_info->sectorsize;
 	struct btrfs_free_space *bitmap;
 	unsigned long i;
 	unsigned long j;
@@ -2505,11 +2504,11 @@ static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
 	if (!bitmap)
 		return false;
 
-	i = offset_to_bit(bitmap->offset, ctl->unit, end);
+	i = offset_to_bit(bitmap->offset, unit, end);
 	j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
 	if (j == i)
 		return false;
-	bytes = (j - i) * ctl->unit;
+	bytes = (j - i) * unit;
 	info->bytes += bytes;
 
 	/* See try_merge_free_space() comment. */
@@ -2528,6 +2527,7 @@ static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
 				       struct btrfs_free_space *info,
 				       bool update_stat)
 {
+	const int unit = ctl->block_group->fs_info->sectorsize;
 	struct btrfs_free_space *bitmap;
 	u64 bitmap_offset;
 	unsigned long i;
@@ -2547,7 +2547,7 @@ static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
 	if (!bitmap)
 		return false;
 
-	i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
+	i = offset_to_bit(bitmap->offset, unit, info->offset) - 1;
 	j = 0;
 	prev_j = (unsigned long)-1;
 	for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
@@ -2559,9 +2559,9 @@ static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
 		return false;
 
 	if (prev_j == (unsigned long)-1)
-		bytes = (i + 1) * ctl->unit;
+		bytes = (i + 1) * unit;
 	else
-		bytes = (i - prev_j) * ctl->unit;
+		bytes = (i - prev_j) * unit;
 
 	info->offset -= bytes;
 	info->bytes += bytes;
@@ -2947,13 +2947,8 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group,
 void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
 			       struct btrfs_free_space_ctl *ctl)
 {
-	struct btrfs_fs_info *fs_info = block_group->fs_info;
-
 	spin_lock_init(&ctl->tree_lock);
-	ctl->unit = fs_info->sectorsize;
-	ctl->start = block_group->start;
 	ctl->block_group = block_group;
-	ctl->op = &free_space_op;
 	ctl->free_space_bytes = RB_ROOT_CACHED;
 	INIT_LIST_HEAD(&ctl->trimming_ranges);
 	mutex_init(&ctl->cache_writeout_mutex);
@@ -3327,6 +3322,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
 				u64 cont1_bytes, u64 min_bytes)
 {
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	const int unit = block_group->fs_info->sectorsize;
 	unsigned long next_zero;
 	unsigned long i;
 	unsigned long want_bits;
@@ -3339,10 +3335,10 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
 
 	lockdep_assert_held(&ctl->tree_lock);
 
-	i = offset_to_bit(entry->offset, ctl->unit,
+	i = offset_to_bit(entry->offset, unit,
 			  max_t(u64, offset, entry->offset));
-	want_bits = bytes_to_bits(bytes, ctl->unit);
-	min_bits = bytes_to_bits(min_bytes, ctl->unit);
+	want_bits = bytes_to_bits(bytes, unit);
+	min_bits = bytes_to_bits(min_bytes, unit);
 
 	/*
 	 * Don't bother looking for a cluster in this bitmap if it's heavily
@@ -3368,7 +3364,7 @@ again:
 	}
 
 	if (!found_bits) {
-		entry->max_extent_size = (u64)max_bits * ctl->unit;
+		entry->max_extent_size = (u64)max_bits * unit;
 		return -ENOSPC;
 	}
 
@@ -3379,15 +3375,15 @@ again:
 
 	total_found += found_bits;
 
-	if (cluster->max_size < found_bits * ctl->unit)
-		cluster->max_size = found_bits * ctl->unit;
+	if (cluster->max_size < found_bits * unit)
+		cluster->max_size = found_bits * unit;
 
 	if (total_found < want_bits || cluster->max_size < cont1_bytes) {
 		i = next_zero + 1;
 		goto again;
 	}
 
-	cluster->window_start = start * ctl->unit + entry->offset;
+	cluster->window_start = start * unit + entry->offset;
 	rb_erase(&entry->offset_index, &ctl->free_space_offset);
 	rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
 
@@ -3403,8 +3399,7 @@ again:
 	ret = tree_insert_offset(ctl, cluster, entry);
 	ASSERT(!ret); /* -EEXIST; Logic error */
 
-	trace_btrfs_setup_cluster(block_group, cluster,
-				  total_found * ctl->unit, 1);
+	trace_btrfs_setup_cluster(block_group, cluster, total_found * unit, 1);
 	return 0;
 }
 
@@ -4044,7 +4039,9 @@ static int trim_bitmaps(struct btrfs_block_group *block_group,
 		}
 next:
 		if (next_bitmap) {
-			offset += BITS_PER_BITMAP * ctl->unit;
+			const int unit = block_group->fs_info->sectorsize;
+
+			offset += BITS_PER_BITMAP * unit;
 			start = offset;
 		} else {
 			start += bytes;
@@ -4071,6 +4068,7 @@ int btrfs_trim_block_group(struct btrfs_block_group *block_group,
 			   u64 *trimmed, u64 start, u64 end, u64 minlen)
 {
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	const int unit = block_group->fs_info->sectorsize;
 	int ret;
 	u64 rem = 0;
 
@@ -4091,7 +4089,7 @@ int btrfs_trim_block_group(struct btrfs_block_group *block_group,
 		goto out;
 
 	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
-	div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
+	div64_u64_rem(end, BITS_PER_BITMAP * unit, &rem);
 	/* If we ended in the middle of a bitmap, reset the trimming flag */
 	if (rem)
 		reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
@@ -4310,6 +4308,7 @@ int test_check_exists(struct btrfs_block_group *cache,
 		      u64 offset, u64 bytes)
 {
 	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
+	const int unit = cache->fs_info->sectorsize;
 	struct btrfs_free_space *info;
 	int ret = 0;
 
@@ -4329,7 +4328,7 @@ have_info:
 		struct btrfs_free_space *tmp;
 
 		bit_off = offset;
-		bit_bytes = ctl->unit;
+		bit_bytes = unit;
 		ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
 		if (!ret) {
 			if (bit_off == offset) {
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 33fc3b245648..53fe8e293af1 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -74,28 +74,20 @@ enum {
 };
 
 struct btrfs_free_space_ctl {
-	spinlock_t tree_lock;
 	struct rb_root free_space_offset;
 	struct rb_root_cached free_space_bytes;
-	u64 free_space;
+	spinlock_t tree_lock;
 	int extents_thresh;
 	int free_extents;
 	int total_bitmaps;
-	int unit;
-	u64 start;
+	u64 free_space;
 	s32 discardable_extents[BTRFS_STAT_NR_ENTRIES];
 	s64 discardable_bytes[BTRFS_STAT_NR_ENTRIES];
-	const struct btrfs_free_space_op *op;
 	struct btrfs_block_group *block_group;
 	struct mutex cache_writeout_mutex;
 	struct list_head trimming_ranges;
 };
 
-struct btrfs_free_space_op {
-	bool (*use_bitmap)(struct btrfs_free_space_ctl *ctl,
-			   struct btrfs_free_space *info);
-};
-
 struct btrfs_io_ctl {
 	void *cur, *orig;
 	struct page *page;
@@ -172,6 +164,8 @@ bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info);
 int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active);
 /* Support functions for running our sanity tests */
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+bool btrfs_use_bitmap(struct btrfs_free_space_ctl *ctl,
+		      struct btrfs_free_space *info);
 int test_add_free_space_entry(struct btrfs_block_group *cache,
 			      u64 offset, u64 bytes, bool bitmap);
 int test_check_exists(struct btrfs_block_group *cache, u64 offset, u64 bytes);
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 472b3060e5ac..1b3d82ae3de8 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -109,7 +109,7 @@ struct btrfs_free_space_info *btrfs_search_free_space_info(
 	ret = btrfs_search_slot(trans, root, &key, path, 0, cow);
 	if (ret < 0)
 		return ERR_PTR(ret);
-	if (ret != 0) {
+	if (unlikely(ret != 0)) {
 		btrfs_warn(fs_info, "missing free space info for %llu",
 			   block_group->start);
 		DEBUG_WARN();
@@ -209,7 +209,8 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
 	u64 bitmap_range, i;
 	u32 bitmap_size, flags, expected_extent_count;
 	u32 extent_count = 0;
-	int done = 0, nr;
+	bool done = false;
+	int nr;
 	int ret;
 
 	bitmap_size = free_space_bitmap_size(fs_info, block_group->length);
@@ -240,7 +241,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
 			if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
 				ASSERT(found_key.objectid == block_group->start);
 				ASSERT(found_key.offset == block_group->length);
-				done = 1;
+				done = true;
 				break;
 			} else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) {
 				u64 first, last;
@@ -353,7 +354,8 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans,
 	u32 bitmap_size, flags, expected_extent_count;
 	unsigned long nrbits, start_bit, end_bit;
 	u32 extent_count = 0;
-	int done = 0, nr;
+	bool done = false;
+	int nr;
 	int ret;
 
 	bitmap_size = free_space_bitmap_size(fs_info, block_group->length);
@@ -384,7 +386,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans,
 			if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
 				ASSERT(found_key.objectid == block_group->start);
 				ASSERT(found_key.offset == block_group->length);
-				done = 1;
+				done = true;
 				break;
 			} else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
 				unsigned long ptr;
@@ -1473,7 +1475,8 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans,
 	struct btrfs_key key, found_key;
 	struct extent_buffer *leaf;
 	u64 start, end;
-	int done = 0, nr;
+	bool done = false;
+	int nr;
 	int ret;
 
 	if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
@@ -1514,7 +1517,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans,
 			if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
 				ASSERT(found_key.objectid == block_group->start);
 				ASSERT(found_key.offset == block_group->length);
-				done = 1;
+				done = true;
 				nr++;
 				path->slots[0]--;
 				break;
@@ -1545,6 +1548,29 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+/* Verify a free space tree key has the expected type and lies inside the block group. */
+static int validate_free_space_key(struct btrfs_block_group *block_group,
+				   const struct btrfs_key *key, u8 expected_type)
+{
+	const u64 end = btrfs_block_group_end(block_group);
+
+	if (unlikely(key->type != expected_type)) {
+		btrfs_err(block_group->fs_info,
+			  "block group %llu has unexpected free space key type %u, expected %u",
+			  block_group->start, key->type, expected_type);
+		return -EUCLEAN;
+	}
+	/* Compare the length against the room left, objectid + offset could wrap around. */
+	if (unlikely(key->objectid < block_group->start || key->objectid >= end ||
+		     key->offset > end - key->objectid)) {
+		btrfs_err(block_group->fs_info,
+			  "block group %llu has invalid free space key (%llu %u %llu)",
+			  block_group->start, key->objectid, key->type, key->offset);
+		return -EUCLEAN;
+	}
+	return 0;
+}
+
 static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
 				   struct btrfs_path *path,
 				   u32 expected_extent_count)
@@ -1576,8 +1602,9 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
 		if (key.type == BTRFS_FREE_SPACE_INFO_KEY)
 			break;
 
-		ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
-		ASSERT(key.objectid < end && key.objectid + key.offset <= end);
+		ret = validate_free_space_key(block_group, &key, BTRFS_FREE_SPACE_BITMAP_KEY);
+		if (unlikely(ret))
+			return ret;
 
 		offset = key.objectid;
 		while (offset < key.objectid + key.offset) {
@@ -1633,7 +1660,6 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
 	struct btrfs_root *root;
 	struct btrfs_key key;
-	const u64 end = btrfs_block_group_end(block_group);
 	u64 total_found = 0;
 	u32 extent_count = 0;
 	int ret;
@@ -1654,8 +1680,9 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
 		if (key.type == BTRFS_FREE_SPACE_INFO_KEY)
 			break;
 
-		ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY);
-		ASSERT(key.objectid < end && key.objectid + key.offset <= end);
+		ret = validate_free_space_key(block_group, &key, BTRFS_FREE_SPACE_EXTENT_KEY);
+		if (unlikely(ret))
+			return ret;
 
 		ret = btrfs_add_new_free_space(block_group, key.objectid,
 					       key.objectid + key.offset,
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index a8aa086a4df8..5f0cfb0b5466 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -50,20 +50,26 @@ struct btrfs_subpage_info;
 struct btrfs_stripe_hash_table;
 struct btrfs_space_info;
 
+/* Minimum and maximum data and metadata block size. */
+#define BTRFS_MIN_BLOCKSIZE	(SZ_4K)
+#define BTRFS_MAX_BLOCKSIZE	(SZ_64K)
+
+/* The maximum folio size btrfs supports. */
+#define BTRFS_MAX_FOLIO_SIZE	(SZ_2M)
+static_assert(BTRFS_MAX_FOLIO_SIZE > PAGE_SIZE);
+
 /*
- * Minimum data and metadata block size.
+ * The maximum number of blocks a huge folio can support.
  *
- * Normally it's 4K, but for testing subpage block size on 4K page systems, we
- * allow DEBUG builds to accept 2K page size.
+ * Depending on the filesystem block size, the real maximum blocks per folio
+ * may also be limited by the above BTRFS_MAX_FOLIO_SIZE.
  */
-#ifdef CONFIG_BTRFS_DEBUG
-#define BTRFS_MIN_BLOCKSIZE	(SZ_2K)
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+#define BTRFS_MAX_BLOCKS_PER_FOLIO		(512)
 #else
-#define BTRFS_MIN_BLOCKSIZE	(SZ_4K)
+#define BTRFS_MAX_BLOCKS_PER_FOLIO		(BITS_PER_LONG)
 #endif
 
-#define BTRFS_MAX_BLOCKSIZE	(SZ_64K)
-
 #define BTRFS_MAX_EXTENT_SIZE SZ_128M
 
 /*
@@ -89,6 +95,10 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
 #define BTRFS_KEY_FMT			"(%llu %u %llu)"
 #define BTRFS_KEY_FMT_VALUE(key)	(key)->objectid, (key)->type, (key)->offset
 
+#define BTRFS_QGROUP_FMT		"%hu/%llu"
+#define BTRFS_QGROUP_FMT_VALUE(qgroup)	btrfs_qgroup_level((qgroup)->qgroupid), \
+					btrfs_qgroup_subvolid((qgroup)->qgroupid)
+
 /*
  * Number of metadata items necessary for an unlink operation:
  *
@@ -486,6 +496,9 @@ struct btrfs_delayed_root {
 	wait_queue_head_t wait;
 };
 
+struct btrfs_free_space_ctl;
+struct btrfs_free_space;
+
 struct btrfs_fs_info {
 	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
 	unsigned long flags;
@@ -644,6 +657,8 @@ struct btrfs_fs_info {
 	 * to protect us from the relocation code.
 	 */
 	struct mutex reloc_mutex;
+	/* Protects setting, clearing and getting fs_info->reloc_ctl. */
+	spinlock_t reloc_ctl_lock;
 
 	struct list_head trans_list;
 	struct list_head dead_roots;
@@ -698,13 +713,6 @@ struct btrfs_fs_info {
 	struct btrfs_workqueue *endio_write_workers;
 	struct btrfs_workqueue *endio_freespace_worker;
 	struct btrfs_workqueue *caching_workers;
-
-	/*
-	 * Fixup workers take dirty pages that didn't properly go through the
-	 * cow mechanism and make them safe to write.  It happens for the
-	 * sys_munmap function call path.
-	 */
-	struct btrfs_workqueue *fixup_workers;
 	struct btrfs_workqueue *delayed_workers;
 
 	struct task_struct *transaction_kthread;
@@ -881,6 +889,7 @@ struct btrfs_fs_info {
 	u32 block_min_order;
 	u32 block_max_order;
 	u32 stripesize;
+	u32 writeback_bio_size;
 	u32 csum_size;
 	u32 csums_per_leaf;
 	u32 csum_type;
@@ -958,6 +967,10 @@ struct btrfs_fs_info {
 	spinlock_t eb_leak_lock;
 	struct list_head allocated_ebs;
 #endif
+
+	/* Used by self tests only. */
+	bool (*use_bitmap)(struct btrfs_free_space_ctl *ctl,
+			   struct btrfs_free_space *info);
 };
 
 #define folio_to_inode(_folio)	(BTRFS_I(_Generic((_folio),			\
@@ -1207,14 +1220,6 @@ static inline void btrfs_force_shutdown(struct btrfs_fs_info *fs_info)
 	}
 }
 
-/*
- * We use folio flag owner_2 to indicate there is an ordered extent with
- * unfinished IO.
- */
-#define folio_test_ordered(folio)	folio_test_owner_2(folio)
-#define folio_set_ordered(folio)	folio_set_owner_2(folio)
-#define folio_clear_ordered(folio)	folio_clear_owner_2(folio)
-
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 
 #define EXPORT_FOR_TESTS
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 509042adee07..272598f6ae77 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -226,9 +226,7 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off
 	u32 item_size;
 	int ret;
 
-	mutex_lock(&fs_info->reloc_mutex);
 	logical = btrfs_get_reloc_bg_bytenr(fs_info);
-	mutex_unlock(&fs_info->reloc_mutex);
 
 	if (logical == U64_MAX) {
 		btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation");
@@ -401,28 +399,6 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
 static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
 						 u64 offset, u64 bytes)
 {
-	pgoff_t index = offset >> PAGE_SHIFT;
-	const pgoff_t end_index = (offset + bytes - 1) >> PAGE_SHIFT;
-	struct folio *folio;
-
-	while (index <= end_index) {
-		folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
-		if (IS_ERR(folio)) {
-			index++;
-			continue;
-		}
-
-		index = folio_next_index(folio);
-		/*
-		 * Here we just clear all Ordered bits for every page in the
-		 * range, then btrfs_mark_ordered_io_finished() will handle
-		 * the ordered extent accounting for the range.
-		 */
-		btrfs_folio_clamp_clear_ordered(inode->root->fs_info, folio,
-						offset, bytes);
-		folio_put(folio);
-	}
-
 	return btrfs_mark_ordered_io_finished(inode, offset, bytes, false);
 }
 
@@ -755,7 +731,7 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 
-	if (!btrfs_inode_can_compress(inode)) {
+	if (unlikely(!btrfs_inode_can_compress(inode))) {
 		DEBUG_WARN("BTRFS: unexpected compression for ino %llu", btrfs_ino(inode));
 		return 0;
 	}
@@ -842,7 +818,7 @@ static struct folio *compressed_bio_last_folio(struct compressed_bio *cb)
 	ASSERT(bio->bi_vcnt);
 
 	bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
-	paddr = page_to_phys(bvec->bv_page) + bvec->bv_offset + bvec->bv_len - 1;
+	paddr = bvec_phys(bvec) + bvec->bv_len - 1;
 	return page_folio(phys_to_page(paddr));
 }
 
@@ -1406,7 +1382,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
 	 * setup for writepage.
 	 */
 	page_ops = ((flags & COW_FILE_RANGE_KEEP_LOCKED) ? 0 : PAGE_UNLOCK);
-	page_ops |= PAGE_SET_ORDERED;
 
 	/*
 	 * Relocation relies on the relocated extents to have exactly the same
@@ -1972,8 +1947,7 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio
 		goto error;
 	extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
 				     EXTENT_LOCKED | EXTENT_DELALLOC |
-				     EXTENT_CLEAR_DATA_RESV,
-				     PAGE_SET_ORDERED);
+				     EXTENT_CLEAR_DATA_RESV, 0);
 	return ret;
 
 error:
@@ -2317,7 +2291,7 @@ error:
 static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
 {
 	if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
-		if (inode->defrag_bytes &&
+		if (data_race(inode->defrag_bytes) &&
 		    btrfs_test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG))
 			return false;
 		return true;
@@ -2605,8 +2579,7 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s
 
 	lockdep_assert_held(&inode->io_tree.lock);
 
-	if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
-		WARN_ON(1);
+	WARN_ON((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC));
 	/*
 	 * set_bit and clear bit hooks normally require _irqsave/restore
 	 * but in this case, we are only testing for the DELALLOC
@@ -2810,7 +2783,13 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
 			      unsigned int extra_bits,
 			      struct extent_state **cached_state)
 {
-	WARN_ON(PAGE_ALIGNED(end));
+	const u32 blocksize = inode->root->fs_info->sectorsize;
+
+	/* Basic alignment check. */
+	ASSERT(IS_ALIGNED(start, blocksize), "start=%llu blocksize=%u",
+	       start, blocksize);
+	ASSERT(IS_ALIGNED(end + 1, blocksize), "inclusive end=%llu blocksize=%u",
+	       end, blocksize);
 
 	if (start >= i_size_read(&inode->vfs_inode) &&
 	    !(inode->flags & BTRFS_INODE_PREALLOC)) {
@@ -2833,206 +2812,50 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
 				    EXTENT_DELALLOC | extra_bits, cached_state);
 }
 
-/* see btrfs_writepage_start_hook for details on why this is required */
-struct btrfs_writepage_fixup {
-	struct folio *folio;
-	struct btrfs_inode *inode;
-	struct btrfs_work work;
-};
-
-static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
-{
-	struct btrfs_writepage_fixup *fixup =
-		container_of(work, struct btrfs_writepage_fixup, work);
-	struct btrfs_ordered_extent *ordered;
-	struct extent_state *cached_state = NULL;
-	struct extent_changeset *data_reserved = NULL;
-	struct folio *folio = fixup->folio;
-	struct btrfs_inode *inode = fixup->inode;
-	struct btrfs_fs_info *fs_info = inode->root->fs_info;
-	u64 page_start = folio_pos(folio);
-	u64 page_end = folio_next_pos(folio) - 1;
-	int ret = 0;
-	bool free_delalloc_space = true;
-
-	/*
-	 * This is similar to page_mkwrite, we need to reserve the space before
-	 * we take the folio lock.
-	 */
-	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
-					   folio_size(folio));
-again:
-	folio_lock(folio);
-
-	/*
-	 * Before we queued this fixup, we took a reference on the folio.
-	 * folio->mapping may go NULL, but it shouldn't be moved to a different
-	 * address space.
-	 */
-	if (!folio->mapping || !folio_test_dirty(folio) ||
-	    !folio_test_checked(folio)) {
-		/*
-		 * Unfortunately this is a little tricky, either
-		 *
-		 * 1) We got here and our folio had already been dealt with and
-		 *    we reserved our space, thus ret == 0, so we need to just
-		 *    drop our space reservation and bail.  This can happen the
-		 *    first time we come into the fixup worker, or could happen
-		 *    while waiting for the ordered extent.
-		 * 2) Our folio was already dealt with, but we happened to get an
-		 *    ENOSPC above from the btrfs_delalloc_reserve_space.  In
-		 *    this case we obviously don't have anything to release, but
-		 *    because the folio was already dealt with we don't want to
-		 *    mark the folio with an error, so make sure we're resetting
-		 *    ret to 0.  This is why we have this check _before_ the ret
-		 *    check, because we do not want to have a surprise ENOSPC
-		 *    when the folio was already properly dealt with.
-		 */
-		if (!ret) {
-			btrfs_delalloc_release_extents(inode, folio_size(folio));
-			btrfs_delalloc_release_space(inode, data_reserved,
-						     page_start, folio_size(folio),
-						     true);
-		}
-		ret = 0;
-		goto out_page;
-	}
-
-	/*
-	 * We can't mess with the folio state unless it is locked, so now that
-	 * it is locked bail if we failed to make our space reservation.
-	 */
-	if (ret)
-		goto out_page;
-
-	btrfs_lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
-
-	/* already ordered? We're done */
-	if (folio_test_ordered(folio))
-		goto out_reserved;
-
-	ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
-	if (ordered) {
-		btrfs_unlock_extent(&inode->io_tree, page_start, page_end,
-				    &cached_state);
-		folio_unlock(folio);
-		btrfs_start_ordered_extent(ordered);
-		btrfs_put_ordered_extent(ordered);
-		goto again;
-	}
-
-	ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
-					&cached_state);
-	if (ret)
-		goto out_reserved;
-
-	/*
-	 * Everything went as planned, we're now the owner of a dirty page with
-	 * delayed allocation bits set and space reserved for our COW
-	 * destination.
-	 *
-	 * The page was dirty when we started, nothing should have cleaned it.
-	 */
-	BUG_ON(!folio_test_dirty(folio));
-	free_delalloc_space = false;
-out_reserved:
-	btrfs_delalloc_release_extents(inode, PAGE_SIZE);
-	if (free_delalloc_space)
-		btrfs_delalloc_release_space(inode, data_reserved, page_start,
-					     PAGE_SIZE, true);
-	btrfs_unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
-out_page:
-	if (ret) {
-		/*
-		 * We hit ENOSPC or other errors.  Update the mapping and page
-		 * to reflect the errors and clean the page.
-		 */
-		mapping_set_error(folio->mapping, ret);
-		btrfs_folio_clear_ordered(fs_info, folio, page_start,
-					  folio_size(folio));
-		btrfs_mark_ordered_io_finished(inode, page_start,
-					       folio_size(folio), !ret);
-		folio_clear_dirty_for_io(folio);
-	}
-	btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
-	folio_unlock(folio);
-	folio_put(folio);
-	kfree(fixup);
-	extent_changeset_free(data_reserved);
-	/*
-	 * As a precaution, do a delayed iput in case it would be the last iput
-	 * that could need flushing space. Recursing back to fixup worker would
-	 * deadlock.
-	 */
-	btrfs_add_delayed_iput(inode);
-}
-
 /*
- * There are a few paths in the higher layers of the kernel that directly
- * set the folio dirty bit without asking the filesystem if it is a
- * good idea.  This causes problems because we want to make sure COW
- * properly happens and the data=ordered rules are followed.
+ * Clear the old accounting flags and set EXTENT_DELALLOC for the range.
  *
- * In our case any range that doesn't have the ORDERED bit set
- * hasn't been properly setup for IO.  We kick off an async process
- * to fix it up.  The async helper will wait for ordered extents, set
- * the delalloc bit and make it safe to write the folio.
+ * Return <0 for error, in that case no range has EXTENT_DELALLOC bit cleared or set.
  */
-int btrfs_writepage_cow_fixup(struct folio *folio)
+int btrfs_reset_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
+				unsigned int extra_bits, struct extent_state **cached_state)
 {
-	struct inode *inode = folio->mapping->host;
-	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
-	struct btrfs_writepage_fixup *fixup;
+	const u32 blocksize = inode->root->fs_info->sectorsize;
 
-	/* This folio has ordered extent covering it already */
-	if (folio_test_ordered(folio))
-		return 0;
-
-	/*
-	 * For experimental build, we error out instead of EAGAIN.
-	 *
-	 * We should not hit such out-of-band dirty folios anymore.
-	 */
-	if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) {
-		DEBUG_WARN();
-		btrfs_err_rl(fs_info,
-	"root %lld ino %llu folio %llu is marked dirty without notifying the fs",
-			     btrfs_root_id(BTRFS_I(inode)->root),
-			     btrfs_ino(BTRFS_I(inode)),
-			     folio_pos(folio));
-		return -EUCLEAN;
-	}
-
-	/*
-	 * folio_checked is set below when we create a fixup worker for this
-	 * folio, don't try to create another one if we're already
-	 * folio_test_checked.
-	 *
-	 * The extent_io writepage code will redirty the foio if we send back
-	 * EAGAIN.
-	 */
-	if (folio_test_checked(folio))
-		return -EAGAIN;
+	/* The @extra_bits can only be EXTENT_NORESERVE for now. */
+	ASSERT(!(extra_bits & ~EXTENT_NORESERVE), "extra_bits=0x%x", extra_bits);
 
-	fixup = kzalloc_obj(*fixup, GFP_NOFS);
-	if (!fixup)
-		return -EAGAIN;
+	/* Basic alignment check. */
+	ASSERT(IS_ALIGNED(start, blocksize), "start=%llu blocksize=%u",
+	       start, blocksize);
+	ASSERT(IS_ALIGNED(end + 1, blocksize), "inclusive end=%llu blocksize=%u",
+	       end, blocksize);
 
 	/*
-	 * We are already holding a reference to this inode from
-	 * write_cache_pages.  We need to hold it because the space reservation
-	 * takes place outside of the folio lock, and we can't trust
-	 * folio->mapping outside of the folio lock.
+	 * Check and set DELALLOC_NEW flag, this needs to search tree thus can
+	 * fail early.  Thus we want to do this before clearing EXTENT_DELALLOC.
 	 */
-	ihold(inode);
-	btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
-	folio_get(folio);
-	btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
-	fixup->folio = folio;
-	fixup->inode = BTRFS_I(inode);
-	btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
+	if (start >= i_size_read(&inode->vfs_inode) &&
+	    !(inode->flags & BTRFS_INODE_PREALLOC)) {
+		/*
+		 * There can't be any extents following EOF in this case so just
+		 * set the delalloc new bit for the range directly.
+		 */
+		extra_bits |= EXTENT_DELALLOC_NEW;
+	} else {
+		int ret;
 
-	return -EAGAIN;
+		ret = btrfs_find_new_delalloc_bytes(inode, start, end + 1 - start,
+						    NULL);
+		if (unlikely(ret))
+			return ret;
+	}
+	/* Clear the old accounting as the range may already be dirty. */
+	btrfs_clear_extent_bit(&inode->io_tree, start, end,
+			       EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+			       EXTENT_DEFRAG, cached_state);
+	return btrfs_set_extent_bit(&inode->io_tree, start, end,
+				    EXTENT_DELALLOC | extra_bits, cached_state);
 }
 
 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
@@ -3798,7 +3621,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 
 		if (!inode && root == fs_info->tree_root) {
 			struct btrfs_root *dead_root;
-			int is_dead_root = 0;
+			bool is_dead_root = false;
 
 			/*
 			 * This is an orphan in the tree root. Currently these
@@ -3820,7 +3643,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 			dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
 							 (unsigned long)found_key.objectid);
 			if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
-				is_dead_root = 1;
+				is_dead_root = true;
 			spin_unlock(&fs_info->fs_roots_radix_lock);
 
 			if (is_dead_root) {
@@ -5181,12 +5004,7 @@ again:
 		goto again;
 	}
 
-	btrfs_clear_extent_bit(&inode->io_tree, block_start, block_end,
-			       EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
-			       &cached_state);
-
-	ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
-					&cached_state);
+	ret = btrfs_reset_extent_delalloc(inode, block_start, block_end, 0, &cached_state);
 	if (ret) {
 		btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state);
 		goto out_unlock;
@@ -5211,8 +5029,6 @@ again:
 	folio_zero_range(folio, zero_start - folio_pos(folio),
 			 zero_end - zero_start + 1);
 
-	btrfs_folio_clear_checked(fs_info, folio, block_start,
-				  block_end + 1 - block_start);
 	btrfs_folio_set_dirty(fs_info, folio, block_start,
 			      block_end + 1 - block_start);
 
@@ -7657,12 +7473,6 @@ static int btrfs_migrate_folio(struct address_space *mapping,
 
 	if (ret)
 		return ret;
-
-	if (folio_test_ordered(src)) {
-		folio_clear_ordered(src);
-		folio_set_ordered(dst);
-	}
-
 	return 0;
 }
 #else
@@ -7751,18 +7561,20 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
 				page_end);
 		ASSERT(range_end + 1 - cur < U32_MAX);
 		range_len = range_end + 1 - cur;
-		if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) {
-			/*
-			 * If Ordered is cleared, it means endio has
-			 * already been executed for the range.
-			 * We can't delete the extent states as
-			 * btrfs_finish_ordered_io() may still use some of them.
-			 */
+		/*
+		 * If the range is not dirty, the range has been submitted and
+		 * since we have waited for the writeback, endio has been
+		 * executed, thus we must skip the range to avoid double
+		 * accounting for the ordered extent.
+		 */
+		if (!btrfs_folio_test_dirty(fs_info, folio, cur, range_len))
 			goto next;
-		}
-		btrfs_folio_clear_ordered(fs_info, folio, cur, range_len);
 
 		/*
+		 * The range is dirty meaning it has not been submitted.
+		 * Here we need to truncate the OE range as the range will never
+		 * be submitted.
+		 *
 		 * IO on this page will never be started, so we need to account
 		 * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
 		 * here, must leave that up for the ordered extent completion.
@@ -7776,11 +7588,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
 					       EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
 					       EXTENT_DEFRAG, &cached_state);
 
-		spin_lock(&inode->ordered_tree_lock);
-		set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
-		ordered->truncated_len = min(ordered->truncated_len,
-					     cur - ordered->file_offset);
-		spin_unlock(&inode->ordered_tree_lock);
+		btrfs_mark_ordered_extent_truncated(ordered, cur - ordered->file_offset);
 
 		/*
 		 * If the ordered extent has finished, we're safe to delete all
@@ -7823,13 +7631,8 @@ next:
 					       &cached_state);
 		cur = range_end + 1;
 	}
-	/*
-	 * We have iterated through all ordered extents of the page, the page
-	 * should not have Ordered anymore, or the above iteration
-	 * did something wrong.
-	 */
-	ASSERT(!folio_test_ordered(folio));
-	btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
+	btrfs_folio_clear_dirty(fs_info, folio, page_start, folio_size(folio));
+	btrfs_clear_folio_dirty_tag(folio);
 	if (!inode_evicting)
 		__btrfs_release_folio(folio, GFP_NOFS);
 	clear_folio_extent_mapped(folio);
@@ -9687,7 +9490,7 @@ ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
 	pages = kzalloc_objs(struct page *, nr_pages, GFP_NOFS);
 	if (!pages)
 		return -ENOMEM;
-	ret = btrfs_alloc_page_array(nr_pages, pages, false);
+	ret = btrfs_alloc_page_array(nr_pages, pages, GFP_NOFS);
 	if (ret) {
 		ret = -ENOMEM;
 		goto out;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a39460bf68a7..9d47d16394fc 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -56,6 +56,7 @@
 #include "uuid-tree.h"
 #include "ioctl.h"
 #include "file.h"
+#include "file-item.h"
 #include "scrub.h"
 #include "super.h"
 
@@ -82,6 +83,30 @@ struct btrfs_ioctl_received_subvol_args_32 {
 
 #define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
 				struct btrfs_ioctl_received_subvol_args_32)
+
+struct btrfs_ioctl_get_subvol_info_args_32 {
+	__u64 treeid;
+	char name[BTRFS_VOL_NAME_MAX + 1];
+	__u64 parent_id;
+	__u64 dirid;
+	__u64 generation;
+	__u64 flags;
+	__u8 uuid[BTRFS_UUID_SIZE];
+	__u8 parent_uuid[BTRFS_UUID_SIZE];
+	__u8 received_uuid[BTRFS_UUID_SIZE];
+	__u64 ctransid;
+	__u64 otransid;
+	__u64 stransid;
+	__u64 rtransid;
+	struct btrfs_ioctl_timespec_32 ctime;
+	struct btrfs_ioctl_timespec_32 otime;
+	struct btrfs_ioctl_timespec_32 stime;
+	struct btrfs_ioctl_timespec_32 rtime;
+	__u64 reserved[8];
+} __attribute__ ((__packed__));
+
+#define BTRFS_IOC_GET_SUBVOL_INFO_32 _IOR(BTRFS_IOCTL_MAGIC, 60, \
+				struct btrfs_ioctl_get_subvol_info_args_32)
 #endif
 
 #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
@@ -707,7 +732,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 {
 	struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
 	struct inode *inode;
-	struct btrfs_pending_snapshot *pending_snapshot;
+	struct btrfs_pending_snapshot AUTO_KFREE(pending_snapshot);
 	unsigned int trans_num_items;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_block_rsv *block_rsv;
@@ -816,7 +841,6 @@ free_pending:
 		free_anon_bdev(pending_snapshot->anon_dev);
 	kfree(pending_snapshot->root_item);
 	btrfs_free_path(pending_snapshot->path);
-	kfree(pending_snapshot);
 
 	return ret;
 }
@@ -961,7 +985,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 	u64 new_size;
 	u64 old_size;
 	u64 devid = 1;
-	struct btrfs_ioctl_vol_args *vol_args;
+	struct btrfs_ioctl_vol_args AUTO_KFREE(vol_args);
 	struct btrfs_device *device = NULL;
 	char *sizestr;
 	char *devstr = NULL;
@@ -987,13 +1011,13 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 	}
 	ret = btrfs_check_ioctl_vol_args_path(vol_args);
 	if (ret < 0)
-		goto out_free;
+		goto out_drop;
 
 	sizestr = vol_args->name;
 	cancel = (strcmp("cancel", sizestr) == 0);
 	ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel);
 	if (ret)
-		goto out_free;
+		goto out_drop;
 	/* Exclusive operation is now claimed */
 
 	devstr = strchr(sizestr, ':');
@@ -1100,8 +1124,6 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 			old_size, new_size);
 out_finish:
 	btrfs_exclop_finish(fs_info);
-out_free:
-	kfree(vol_args);
 out_drop:
 	mnt_drop_write_file(file);
 	return ret;
@@ -1114,7 +1136,7 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
 				struct btrfs_qgroup_inherit *inherit)
 {
 	int ret;
-	struct qstr qname = QSTR_INIT(name, strlen(name));
+	struct qstr qname = QSTR(name);
 
 	if (!S_ISDIR(file_inode(file)->i_mode))
 		return -ENOTDIR;
@@ -1179,7 +1201,7 @@ out_drop_write:
 static noinline int btrfs_ioctl_snap_create(struct file *file,
 					    void __user *arg, bool subvol)
 {
-	struct btrfs_ioctl_vol_args *vol_args;
+	struct btrfs_ioctl_vol_args AUTO_KFREE(vol_args);
 	int ret;
 
 	if (!S_ISDIR(file_inode(file)->i_mode))
@@ -1190,24 +1212,20 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
 		return PTR_ERR(vol_args);
 	ret = btrfs_check_ioctl_vol_args_path(vol_args);
 	if (ret < 0)
-		goto out;
-
-	ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file),
-					vol_args->name, vol_args->fd, subvol,
-					false, NULL);
+		return ret;
 
-out:
-	kfree(vol_args);
-	return ret;
+	return __btrfs_ioctl_snap_create(file, file_mnt_idmap(file),
+					 vol_args->name, vol_args->fd, subvol,
+					 false, NULL);
 }
 
 static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
 					       void __user *arg, bool subvol)
 {
-	struct btrfs_ioctl_vol_args_v2 *vol_args;
+	struct btrfs_ioctl_vol_args_v2 AUTO_KFREE(vol_args);
+	struct btrfs_qgroup_inherit AUTO_KFREE(inherit);
 	int ret;
 	bool readonly = false;
-	struct btrfs_qgroup_inherit *inherit = NULL;
 
 	if (!S_ISDIR(file_inode(file)->i_mode))
 		return -ENOTDIR;
@@ -1217,44 +1235,32 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
 		return PTR_ERR(vol_args);
 	ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args);
 	if (ret < 0)
-		goto free_args;
+		return ret;
 
-	if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) {
-		ret = -EOPNOTSUPP;
-		goto free_args;
-	}
+	if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK)
+		return -EOPNOTSUPP;
 
 	if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
 		readonly = true;
 	if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
 		struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file));
 
-		if (vol_args->size < sizeof(*inherit) ||
-		    vol_args->size > PAGE_SIZE) {
-			ret = -EINVAL;
-			goto free_args;
-		}
+		if (vol_args->size < sizeof(*inherit) || vol_args->size > PAGE_SIZE)
+			return -EINVAL;
+
 		inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
 		if (IS_ERR(inherit)) {
-			ret = PTR_ERR(inherit);
-			goto free_args;
+			return PTR_ERR(inherit);
 		}
 
 		ret = btrfs_qgroup_check_inherit(fs_info, inherit, vol_args->size);
 		if (ret < 0)
-			goto free_inherit;
+			return ret;
 	}
 
-	ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file),
-					vol_args->name, vol_args->fd, subvol,
-					readonly, inherit);
-	if (ret)
-		goto free_inherit;
-free_inherit:
-	kfree(inherit);
-free_args:
-	kfree(vol_args);
-	return ret;
+	return __btrfs_ioctl_snap_create(file, file_mnt_idmap(file),
+					 vol_args->name, vol_args->fd, subvol,
+					 readonly, inherit);
 }
 
 static noinline int btrfs_ioctl_subvol_getflags(struct btrfs_inode *inode,
@@ -1865,7 +1871,7 @@ out_put:
 static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root,
 					   void __user *argp)
 {
-	struct btrfs_ioctl_ino_lookup_args *args;
+	struct btrfs_ioctl_ino_lookup_args AUTO_KFREE(args);
 	int ret = 0;
 
 	args = memdup_user(argp, sizeof(*args));
@@ -1895,9 +1901,8 @@ static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root,
 
 out:
 	if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
-		ret = -EFAULT;
+		return -EFAULT;
 
-	kfree(args);
 	return ret;
 }
 
@@ -1915,7 +1920,7 @@ out:
  */
 static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
 {
-	struct btrfs_ioctl_ino_lookup_user_args *args;
+	struct btrfs_ioctl_ino_lookup_user_args AUTO_KFREE(args);
 	struct inode *inode;
 	int ret;
 
@@ -1931,7 +1936,6 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
 		 * The subvolume does not exist under fd with which this is
 		 * called
 		 */
-		kfree(args);
 		return -EACCES;
 	}
 
@@ -1940,14 +1944,13 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
 	if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
 		ret = -EFAULT;
 
-	kfree(args);
 	return ret;
 }
 
 /* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */
-static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
+static int _btrfs_ioctl_get_subvol_info(struct inode *inode,
+					struct btrfs_ioctl_get_subvol_info_args *subvol_info)
 {
-	struct btrfs_ioctl_get_subvol_info_args *subvol_info;
 	struct btrfs_fs_info *fs_info;
 	struct btrfs_root *root;
 	struct btrfs_path *path;
@@ -1956,7 +1959,6 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
 	struct btrfs_root_ref *rref;
 	struct extent_buffer *leaf;
 	unsigned long item_off;
-	unsigned long item_len;
 	int slot;
 	int ret = 0;
 
@@ -1964,12 +1966,6 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
 	if (!path)
 		return -ENOMEM;
 
-	subvol_info = kzalloc_obj(*subvol_info);
-	if (!subvol_info) {
-		btrfs_free_path(path);
-		return -ENOMEM;
-	}
-
 	fs_info = BTRFS_I(inode)->root->fs_info;
 
 	/* Get root_item of inode's subvolume */
@@ -2031,33 +2027,91 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
 		btrfs_item_key_to_cpu(leaf, &key, slot);
 		if (key.objectid == subvol_info->treeid &&
 		    key.type == BTRFS_ROOT_BACKREF_KEY) {
+			u16 name_len;
+
 			subvol_info->parent_id = key.offset;
 
 			rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
+			name_len = btrfs_root_ref_name_len(leaf, rref);
 			subvol_info->dirid = btrfs_root_ref_dirid(leaf, rref);
 
-			item_off = btrfs_item_ptr_offset(leaf, slot)
-					+ sizeof(struct btrfs_root_ref);
-			item_len = btrfs_item_size(leaf, slot)
-					- sizeof(struct btrfs_root_ref);
+			item_off = btrfs_item_ptr_offset(leaf, slot) + sizeof(*rref);
 			read_extent_buffer(leaf, subvol_info->name,
-					   item_off, item_len);
+					   item_off, name_len);
 		} else {
 			ret = -ENOENT;
 			goto out;
 		}
 	}
 
-	btrfs_free_path(path);
-	path = NULL;
-	if (copy_to_user(argp, subvol_info, sizeof(*subvol_info)))
-		ret = -EFAULT;
-
 out:
 	btrfs_put_root(root);
 out_free:
 	btrfs_free_path(path);
-	kfree(subvol_info);
+	return ret;
+}
+
+#ifdef CONFIG_64BIT
+/* GET_SUBVOL_INFO for 32bit callers: fill the native args, then repack them. */
+static int btrfs_ioctl_get_subvol_info_32(struct inode *inode, void __user *argp)
+{
+	struct btrfs_ioctl_get_subvol_info_args AUTO_KFREE(info);
+	struct btrfs_ioctl_get_subvol_info_args_32 AUTO_KFREE(info32);
+	int ret;
+
+	info = kzalloc_obj(*info);
+	if (!info)
+		return -ENOMEM;
+
+	info32 = kzalloc_obj(*info32);
+	if (!info32)
+		return -ENOMEM;
+
+	ret = _btrfs_ioctl_get_subvol_info(inode, info);
+	if (ret)
+		return ret;
+
+	info32->treeid = info->treeid;
+	info32->parent_id = info->parent_id;
+	info32->dirid = info->dirid;
+	info32->generation = info->generation;
+	info32->flags = info->flags;
+	info32->ctransid = info->ctransid;
+	info32->otransid = info->otransid;
+	info32->stransid = info->stransid;
+	info32->rtransid = info->rtransid;
+	memcpy(info32->name, info->name, sizeof(info32->name));
+	memcpy(info32->uuid, info->uuid, BTRFS_UUID_SIZE);
+	memcpy(info32->parent_uuid, info->parent_uuid, BTRFS_UUID_SIZE);
+	memcpy(info32->received_uuid, info->received_uuid, BTRFS_UUID_SIZE);
+	info32->ctime.sec = info->ctime.sec;
+	info32->ctime.nsec = info->ctime.nsec;
+	info32->otime.sec = info->otime.sec;
+	info32->otime.nsec = info->otime.nsec;
+	info32->stime.sec = info->stime.sec;
+	info32->stime.nsec = info->stime.nsec;
+	info32->rtime.sec = info->rtime.sec;
+	info32->rtime.nsec = info->rtime.nsec;
+
+	if (copy_to_user(argp, info32, sizeof(*info32)))
+		return -EFAULT;
+	return 0;
+}
+#endif
+
+/* Native GET_SUBVOL_INFO: collect the info in a kernel buffer, then copy it out. */
+static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
+{
+	struct btrfs_ioctl_get_subvol_info_args AUTO_KFREE(info);
+	int ret;
+
+	info = kzalloc_obj(*info);
+	if (!info)
+		return -ENOMEM;
+	ret = _btrfs_ioctl_get_subvol_info(inode, info);
+	if (ret == 0 && copy_to_user(argp, info, sizeof(*info)))
+		ret = -EFAULT;
+
 	return ret;
 }
 
@@ -2068,7 +2122,7 @@ out_free:
 static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
 					  void __user *argp)
 {
-	struct btrfs_ioctl_get_subvol_rootref_args *rootrefs;
+	struct btrfs_ioctl_get_subvol_rootref_args AUTO_KFREE(rootrefs);
 	struct btrfs_root_ref *rref;
 	struct btrfs_path *path;
 	struct btrfs_key key;
@@ -2151,8 +2205,6 @@ out:
 			ret = -EFAULT;
 	}
 
-	kfree(rootrefs);
-
 	return ret;
 }
 
@@ -2167,8 +2219,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 	struct inode *inode;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_root *dest = NULL;
-	struct btrfs_ioctl_vol_args *vol_args = NULL;
-	struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL;
+	struct btrfs_ioctl_vol_args AUTO_KFREE(vol_args);
+	struct btrfs_ioctl_vol_args_v2 AUTO_KFREE(vol_args2);
 	struct mnt_idmap *idmap = file_mnt_idmap(file);
 	char *subvol_name, *subvol_name_ptr = NULL;
 	int ret = 0;
@@ -2186,10 +2238,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 		if (IS_ERR(vol_args2))
 			return PTR_ERR(vol_args2);
 
-		if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK) {
-			ret = -EOPNOTSUPP;
-			goto out;
-		}
+		if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK)
+			return -EOPNOTSUPP;
 
 		/*
 		 * If SPEC_BY_ID is not set, we are looking for the subvolume by
@@ -2198,23 +2248,21 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 		if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) {
 			ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args2);
 			if (ret < 0)
-				goto out;
+				return ret;
 			subvol_name = vol_args2->name;
 
 			ret = mnt_want_write_file(file);
 			if (ret)
-				goto out;
+				return ret;
 		} else {
 			struct inode *old_dir;
 
-			if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) {
-				ret = -EINVAL;
-				goto out;
-			}
+			if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID)
+				return -EINVAL;
 
 			ret = mnt_want_write_file(file);
 			if (ret)
-				goto out;
+				return ret;
 
 			dentry = btrfs_get_dentry(fs_info->sb,
 					BTRFS_FIRST_FREE_OBJECTID,
@@ -2284,13 +2332,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 
 		ret = btrfs_check_ioctl_vol_args_path(vol_args);
 		if (ret < 0)
-			goto out;
+			return ret;
 
 		subvol_name = vol_args->name;
 
 		ret = mnt_want_write_file(file);
 		if (ret)
-			goto out;
+			return ret;
 	}
 
 	if (strchr(subvol_name, '/') ||
@@ -2371,9 +2419,6 @@ free_parent:
 		dput(parent);
 out_drop_write:
 	mnt_drop_write_file(file);
-out:
-	kfree(vol_args2);
-	kfree(vol_args);
 	return ret;
 }
 
@@ -2461,7 +2506,7 @@ out:
 
 static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
 {
-	struct btrfs_ioctl_vol_args *vol_args;
+	struct btrfs_ioctl_vol_args AUTO_KFREE(vol_args);
 	bool restore_op = false;
 	int ret;
 
@@ -2501,15 +2546,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
 
 	ret = btrfs_check_ioctl_vol_args_path(vol_args);
 	if (ret < 0)
-		goto out_free;
+		goto out;
 
 	ret = btrfs_init_new_device(fs_info, vol_args->name);
 
 	if (!ret)
 		btrfs_info(fs_info, "disk added %s", vol_args->name);
 
-out_free:
-	kfree(vol_args);
 out:
 	if (restore_op)
 		btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
@@ -2523,7 +2566,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
 	BTRFS_DEV_LOOKUP_ARGS(args);
 	struct inode *inode = file_inode(file);
 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
-	struct btrfs_ioctl_vol_args_v2 *vol_args;
+	struct btrfs_ioctl_vol_args_v2 AUTO_KFREE(vol_args);
 	struct file *bdev_file = NULL;
 	int ret;
 	bool cancel = false;
@@ -2582,7 +2625,6 @@ err_drop:
 		bdev_fput(bdev_file);
 out:
 	btrfs_put_dev_args_from_path(&args);
-	kfree(vol_args);
 	return ret;
 }
 
@@ -2591,7 +2633,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
 	BTRFS_DEV_LOOKUP_ARGS(args);
 	struct inode *inode = file_inode(file);
 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
-	struct btrfs_ioctl_vol_args *vol_args;
+	struct btrfs_ioctl_vol_args AUTO_KFREE(vol_args);
 	struct file *bdev_file = NULL;
 	int ret;
 	bool cancel = false;
@@ -2605,7 +2647,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
 
 	ret = btrfs_check_ioctl_vol_args_path(vol_args);
 	if (ret < 0)
-		goto out_free;
+		return ret;
 
 	if (!strcmp("cancel", vol_args->name)) {
 		cancel = true;
@@ -2633,19 +2675,16 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
 		bdev_fput(bdev_file);
 out:
 	btrfs_put_dev_args_from_path(&args);
-out_free:
-	kfree(vol_args);
 	return ret;
 }
 
 static long btrfs_ioctl_fs_info(const struct btrfs_fs_info *fs_info,
 				void __user *arg)
 {
-	struct btrfs_ioctl_fs_info_args *fi_args;
+	struct btrfs_ioctl_fs_info_args AUTO_KFREE(fi_args);
 	struct btrfs_device *device;
 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 	u64 flags_in;
-	int ret = 0;
 
 	fi_args = memdup_user(arg, sizeof(*fi_args));
 	if (IS_ERR(fi_args))
@@ -2686,17 +2725,16 @@ static long btrfs_ioctl_fs_info(const struct btrfs_fs_info *fs_info,
 	}
 
 	if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
-		ret = -EFAULT;
+		return -EFAULT;
 
-	kfree(fi_args);
-	return ret;
+	return 0;
 }
 
 static long btrfs_ioctl_dev_info(const struct btrfs_fs_info *fs_info,
 				 void __user *arg)
 {
 	BTRFS_DEV_LOOKUP_ARGS(args);
-	struct btrfs_ioctl_dev_info_args *di_args;
+	struct btrfs_ioctl_dev_info_args AUTO_KFREE(di_args);
 	struct btrfs_device *dev;
 	int ret = 0;
 
@@ -2730,7 +2768,6 @@ out:
 	if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
 		ret = -EFAULT;
 
-	kfree(di_args);
 	return ret;
 }
 
@@ -2792,9 +2829,13 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
 	if (IS_ERR_OR_NULL(di)) {
 		btrfs_release_path(path);
 		btrfs_end_transaction(trans);
+		if (di)
+			ret = PTR_ERR(di);
+		else
+			ret = -ENOENT;
 		btrfs_err(fs_info,
-			  "Umm, you don't have the default diritem, this isn't going to work");
-		ret = -ENOENT;
+			  "could not find default diritem for dir %llu: %d",
+			  dir_id, ret);
 		goto out_free;
 	}
 
@@ -3011,7 +3052,7 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info,
 static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
 {
 	struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file));
-	struct btrfs_ioctl_scrub_args *sa;
+	struct btrfs_ioctl_scrub_args AUTO_KFREE(sa);
 	int ret;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -3026,15 +3067,13 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
 	if (IS_ERR(sa))
 		return PTR_ERR(sa);
 
-	if (sa->flags & ~BTRFS_SCRUB_SUPPORTED_FLAGS) {
-		ret = -EOPNOTSUPP;
-		goto out;
-	}
+	if (sa->flags & ~BTRFS_SCRUB_SUPPORTED_FLAGS)
+		return -EOPNOTSUPP;
 
 	if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
 		ret = mnt_want_write_file(file);
 		if (ret)
-			goto out;
+			return ret;
 	}
 
 	ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end,
@@ -3058,8 +3097,7 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
 
 	if (!(sa->flags & BTRFS_SCRUB_READONLY))
 		mnt_drop_write_file(file);
-out:
-	kfree(sa);
+
 	return ret;
 }
 
@@ -3074,7 +3112,7 @@ static long btrfs_ioctl_scrub_cancel(struct btrfs_fs_info *fs_info)
 static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info,
 				       void __user *arg)
 {
-	struct btrfs_ioctl_scrub_args *sa;
+	struct btrfs_ioctl_scrub_args AUTO_KFREE(sa);
 	int ret;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -3087,40 +3125,36 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info,
 	ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress);
 
 	if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
-		ret = -EFAULT;
+		return -EFAULT;
 
-	kfree(sa);
 	return ret;
 }
 
 static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info,
 				      void __user *arg)
 {
-	struct btrfs_ioctl_get_dev_stats *sa;
+	struct btrfs_ioctl_get_dev_stats AUTO_KFREE(sa);
 	int ret;
 
 	sa = memdup_user(arg, sizeof(*sa));
 	if (IS_ERR(sa))
 		return PTR_ERR(sa);
 
-	if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) {
-		kfree(sa);
+	if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
-	}
 
 	ret = btrfs_get_dev_stats(fs_info, sa);
 
 	if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
-		ret = -EFAULT;
+		return -EFAULT;
 
-	kfree(sa);
 	return ret;
 }
 
 static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
 				    void __user *arg)
 {
-	struct btrfs_ioctl_dev_replace_args *p;
+	struct btrfs_ioctl_dev_replace_args AUTO_KFREE(p);
 	int ret;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -3137,10 +3171,8 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
 
 	switch (p->cmd) {
 	case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
-		if (sb_rdonly(fs_info->sb)) {
-			ret = -EROFS;
-			goto out;
-		}
+		if (sb_rdonly(fs_info->sb))
+			return -EROFS;
 		if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
 			ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
 		} else {
@@ -3162,9 +3194,8 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
 	}
 
 	if ((ret == 0 || ret == -ECANCELED) && copy_to_user(arg, p, sizeof(*p)))
-		ret = -EFAULT;
-out:
-	kfree(p);
+		return -EFAULT;
+
 	return ret;
 }
 
@@ -3174,7 +3205,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
 	int i;
 	u64 rel_ptr;
 	int size;
-	struct btrfs_ioctl_ino_path_args *ipa = NULL;
+	struct btrfs_ioctl_ino_path_args AUTO_KFREE(ipa);
 	struct inode_fs_paths *ipath __free(inode_fs_paths) = NULL;
 	struct btrfs_path *path;
 
@@ -3223,7 +3254,6 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
 
 out:
 	btrfs_free_path(path);
-	kfree(ipa);
 
 	return ret;
 }
@@ -3233,8 +3263,8 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
 {
 	int ret = 0;
 	int size;
-	struct btrfs_ioctl_logical_ino_args *loi;
-	struct btrfs_data_container *inodes = NULL;
+	struct btrfs_ioctl_logical_ino_args AUTO_KFREE(loi);
+	struct btrfs_data_container AUTO_KVFREE(inodes);
 	bool ignore_offset;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -3249,41 +3279,32 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
 		size = min_t(u32, loi->size, SZ_64K);
 	} else {
 		/* All reserved bits must be 0 for now */
-		if (memchr_inv(loi->reserved, 0, sizeof(loi->reserved))) {
-			ret = -EINVAL;
-			goto out_loi;
-		}
+		if (memchr_inv(loi->reserved, 0, sizeof(loi->reserved)))
+			return -EINVAL;
+
 		/* Only accept flags we have defined so far */
-		if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) {
-			ret = -EINVAL;
-			goto out_loi;
-		}
+		if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET))
+			return -EINVAL;
+
 		ignore_offset = loi->flags & BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET;
 		size = min_t(u32, loi->size, SZ_16M);
 	}
 
 	inodes = init_data_container(size);
-	if (IS_ERR(inodes)) {
-		ret = PTR_ERR(inodes);
-		goto out_loi;
-	}
+	if (IS_ERR(inodes))
+		return PTR_ERR(inodes);
 
 	ret = iterate_inodes_from_logical(loi->logical, fs_info, inodes, ignore_offset);
 	if (ret == -EINVAL)
-		ret = -ENOENT;
+		return -ENOENT;
 	if (ret < 0)
-		goto out;
+		return ret;
 
 	ret = copy_to_user((void __user *)(unsigned long)loi->inodes, inodes,
 			   size);
 	if (ret)
 		ret = -EFAULT;
 
-out:
-	kvfree(inodes);
-out_loi:
-	kfree(loi);
-
 	return ret;
 }
 
@@ -3380,7 +3401,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
 {
 	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct btrfs_ioctl_balance_args *bargs;
+	struct btrfs_ioctl_balance_args AUTO_KFREE(bargs);
 	struct btrfs_balance_control *bctl;
 	bool need_unlock = true;
 	int ret;
@@ -3465,7 +3486,6 @@ out_unlock:
 		btrfs_exclop_finish(fs_info);
 out:
 	mnt_drop_write_file(file);
-	kfree(bargs);
 	return ret;
 }
 
@@ -3518,7 +3538,7 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
 {
 	struct inode *inode = file_inode(file);
 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
-	struct btrfs_ioctl_quota_ctl_args *sa;
+	struct btrfs_ioctl_quota_ctl_args AUTO_KFREE(sa);
 	int ret;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -3577,7 +3597,6 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
 		break;
 	}
 
-	kfree(sa);
 drop_write:
 	mnt_drop_write_file(file);
 	return ret;
@@ -3588,8 +3607,8 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
 	struct inode *inode = file_inode(file);
 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_ioctl_qgroup_assign_args *sa;
-	struct btrfs_qgroup_list *prealloc = NULL;
+	struct btrfs_ioctl_qgroup_assign_args AUTO_KFREE(sa);
+	struct btrfs_qgroup_list AUTO_KFREE(prealloc);
 	struct btrfs_trans_handle *trans;
 	int ret;
 	int err;
@@ -3614,7 +3633,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
 		prealloc = kzalloc_obj(*prealloc);
 		if (!prealloc) {
 			ret = -ENOMEM;
-			goto out;
+			goto drop_write;
 		}
 	}
 
@@ -3622,7 +3641,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
 	trans = btrfs_start_transaction(root, 2);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
-		goto out;
+		goto drop_write;
 	}
 
 	/*
@@ -3648,9 +3667,6 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
 	if (err && !ret)
 		ret = err;
 
-out:
-	kfree(prealloc);
-	kfree(sa);
 drop_write:
 	mnt_drop_write_file(file);
 	return ret;
@@ -3660,7 +3676,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
 {
 	struct inode *inode = file_inode(file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_ioctl_qgroup_create_args *sa;
+	struct btrfs_ioctl_qgroup_create_args AUTO_KFREE(sa);
 	struct btrfs_trans_handle *trans;
 	int ret;
 	int err;
@@ -3683,12 +3699,12 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
 
 	if (!sa->qgroupid) {
 		ret = -EINVAL;
-		goto out;
+		goto drop_write;
 	}
 
 	if (sa->create && btrfs_is_fstree(sa->qgroupid)) {
 		ret = -EINVAL;
-		goto out;
+		goto drop_write;
 	}
 
 	/*
@@ -3698,7 +3714,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
 	trans = btrfs_start_transaction(root, 2);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
-		goto out;
+		goto drop_write;
 	}
 
 	if (sa->create) {
@@ -3711,8 +3727,6 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
 	if (err && !ret)
 		ret = err;
 
-out:
-	kfree(sa);
 drop_write:
 	mnt_drop_write_file(file);
 	return ret;
@@ -3722,7 +3736,7 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
 {
 	struct inode *inode = file_inode(file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_ioctl_qgroup_limit_args *sa;
+	struct btrfs_ioctl_qgroup_limit_args AUTO_KFREE(sa);
 	struct btrfs_trans_handle *trans;
 	int ret;
 	int err;
@@ -3748,7 +3762,7 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
 	trans = btrfs_start_transaction(root, 1);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
-		goto out;
+		goto drop_write;
 	}
 
 	qgroupid = sa->qgroupid;
@@ -3763,8 +3777,6 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
 	if (err && !ret)
 		ret = err;
 
-out:
-	kfree(sa);
 drop_write:
 	mnt_drop_write_file(file);
 	return ret;
@@ -3774,7 +3786,7 @@ static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
 {
 	struct inode *inode = file_inode(file);
 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
-	struct btrfs_ioctl_quota_rescan_args *qsa;
+	struct btrfs_ioctl_quota_rescan_args AUTO_KFREE(qsa);
 	int ret;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -3795,13 +3807,11 @@ static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
 
 	if (qsa->flags) {
 		ret = -EINVAL;
-		goto out;
+		goto drop_write;
 	}
 
 	ret = btrfs_qgroup_rescan(fs_info);
 
-out:
-	kfree(qsa);
 drop_write:
 	mnt_drop_write_file(file);
 	return ret;
@@ -3946,8 +3956,8 @@ out:
 static long btrfs_ioctl_set_received_subvol_32(struct file *file,
 						void __user *arg)
 {
-	struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL;
-	struct btrfs_ioctl_received_subvol_args *args64 = NULL;
+	struct btrfs_ioctl_received_subvol_args_32 AUTO_KFREE(args32);
+	struct btrfs_ioctl_received_subvol_args AUTO_KFREE(args64);
 	int ret = 0;
 
 	args32 = memdup_user(arg, sizeof(*args32));
@@ -3955,10 +3965,8 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file,
 		return PTR_ERR(args32);
 
 	args64 = kmalloc_obj(*args64);
-	if (!args64) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	if (!args64)
+		return -ENOMEM;
 
 	memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE);
 	args64->stransid = args32->stransid;
@@ -3971,7 +3979,7 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file,
 
 	ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_idmap(file), args64);
 	if (ret)
-		goto out;
+		return ret;
 
 	memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE);
 	args32->stransid = args64->stransid;
@@ -3984,19 +3992,16 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file,
 
 	ret = copy_to_user(arg, args32, sizeof(*args32));
 	if (ret)
-		ret = -EFAULT;
+		return -EFAULT;
 
-out:
-	kfree(args32);
-	kfree(args64);
-	return ret;
+	return 0;
 }
 #endif
 
 static long btrfs_ioctl_set_received_subvol(struct file *file,
 					    void __user *arg)
 {
-	struct btrfs_ioctl_received_subvol_args *sa = NULL;
+	struct btrfs_ioctl_received_subvol_args AUTO_KFREE(sa);
 	int ret = 0;
 
 	sa = memdup_user(arg, sizeof(*sa));
@@ -4004,17 +4009,14 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
 		return PTR_ERR(sa);
 
 	ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_idmap(file), sa);
-
 	if (ret)
-		goto out;
+		return ret;
 
 	ret = copy_to_user(arg, sa, sizeof(*sa));
 	if (ret)
-		ret = -EFAULT;
+		return -EFAULT;
 
-out:
-	kfree(sa);
-	return ret;
+	return 0;
 }
 
 static int btrfs_ioctl_get_fslabel(struct btrfs_fs_info *fs_info,
@@ -4254,11 +4256,11 @@ out_drop_write:
 
 static int _btrfs_ioctl_send(struct btrfs_root *root, void __user *argp, bool compat)
 {
-	struct btrfs_ioctl_send_args *arg;
-	int ret;
+	struct btrfs_ioctl_send_args AUTO_KFREE(arg);
 
 	if (compat) {
 #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+		int ret;
 		struct btrfs_ioctl_send_args_32 args32 = { 0 };
 
 		ret = copy_from_user(&args32, argp, sizeof(args32));
@@ -4283,9 +4285,7 @@ static int _btrfs_ioctl_send(struct btrfs_root *root, void __user *argp, bool co
 		if (IS_ERR(arg))
 			return PTR_ERR(arg);
 	}
-	ret = btrfs_ioctl_send(root, arg);
-	kfree(arg);
-	return ret;
+	return btrfs_ioctl_send(root, arg);
 }
 
 static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
@@ -4621,7 +4621,7 @@ static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter,
 	pages = kzalloc_objs(struct page *, nr_pages, GFP_NOFS);
 	if (!pages)
 		return -ENOMEM;
-	ret = btrfs_alloc_page_array(nr_pages, pages, 0);
+	ret = btrfs_alloc_page_array(nr_pages, pages, GFP_NOFS);
 	if (ret) {
 		ret = -ENOMEM;
 		goto out_fail;
@@ -5092,7 +5092,8 @@ static int btrfs_ioctl_subvol_sync(struct btrfs_fs_info *fs_info, void __user *a
 			return -ENOENT;
 
 		wait_for_deletion = true;
-		ASSERT(root_flags & BTRFS_ROOT_SUBVOL_DEAD);
+		ASSERT(root_flags & BTRFS_ROOT_SUBVOL_DEAD, "root_flags=0x%llx",
+		       root_flags);
 		sched_ret = schedule_timeout_interruptible(HZ);
 		/* Early wake up or error. */
 		if (sched_ret != 0)
@@ -5140,6 +5141,342 @@ static int btrfs_ioctl_shutdown(struct btrfs_fs_info *fs_info, unsigned long arg
 	return ret;
 }
 
+#define GET_CSUMS_BUF_MAX	SZ_16M	/* Cap on the user buffer bytes used by one GET_CSUMS call. */
+
+/*
+ * Copy the checksums of the disk range [@disk_bytenr, @disk_bytenr + @len)
+ * to the user buffer @buf.
+ *
+ * Each run of checksums found by the lookup is stored at the position its
+ * first sector has relative to @disk_bytenr, nothing is written for sectors
+ * the lookup did not return.
+ *
+ * Return 0 on success, -EUCLEAN if the csum root is missing, -EFAULT if the
+ * copy to @buf failed, or the negative error of the checksum lookup.
+ */
+static int copy_csums_to_user(struct btrfs_fs_info *fs_info, u64 disk_bytenr,
+			      u64 len, u8 __user *buf)
+{
+	const u32 csum_size = fs_info->csum_size;
+	const u64 last_byte = disk_bytenr + len - 1;
+	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr);
+	struct btrfs_ordered_sum *sums;
+	struct btrfs_ordered_sum *next;
+	LIST_HEAD(list);
+	int ret;
+
+	if (unlikely(!csum_root)) {
+		btrfs_err(fs_info, "missing csum root for extent at bytenr %llu", disk_bytenr);
+		return -EUCLEAN;
+	}
+
+	ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, last_byte, &list, false);
+	if (ret < 0)
+		return ret;
+	ret = 0;
+
+	/* Always drain the list, after a failed copy the rest is only freed. */
+	list_for_each_entry_safe(sums, next, &list, list) {
+		u64 dst_off = (sums->logical - disk_bytenr) >> fs_info->sectorsize_bits;
+		size_t nr_bytes = (sums->len >> fs_info->sectorsize_bits) * csum_size;
+
+		dst_off *= csum_size;
+		list_del(&sums->list);
+
+		if (!ret && copy_to_user(buf + dst_off, sums->sums, nr_bytes))
+			ret = -EFAULT;
+
+		kfree(sums);
+	}
+
+	return ret;
+}
+
+/*
+ * BTRFS_IOC_GET_CSUMS: describe a sector aligned range of a regular file as
+ * entries in the user buffer that follows the args at @argp, raw checksums
+ * follow each HAS_CSUMS entry.  The args are updated with the unprocessed rest
+ * of the range and the buffer bytes used.  Return 0 (also if partial) or <0.
+ */
+static int btrfs_ioctl_get_csums(struct file *file, void __user *argp)
+{
+	struct inode *vfs_inode = file_inode(file);
+	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct btrfs_root *root = inode->root;
+	struct btrfs_ioctl_get_csums_args args;
+	BTRFS_PATH_AUTO_FREE(path);
+	const u64 ino = btrfs_ino(inode);
+	const u32 csum_size = fs_info->csum_size;
+	u8 __user *ubuf;
+	u64 buf_limit;
+	u64 buf_used = 0;
+	u64 cur_offset;
+	u64 end_offset;
+	u64 prev_extent_end;
+	struct btrfs_key key;
+	int ret;
+
+	if (!(file->f_mode & FMODE_READ))
+		return -EBADF;
+
+	if (!S_ISREG(vfs_inode->i_mode))
+		return -EINVAL;
+
+	if (copy_from_user(&args, argp, sizeof(args)))
+		return -EFAULT;
+
+	if (!IS_ALIGNED(args.offset, fs_info->sectorsize) ||
+	    !IS_ALIGNED(args.length, fs_info->sectorsize))
+		return -EINVAL;
+	if (args.length == 0)
+		return -EINVAL;
+	if (args.offset + args.length < args.offset)
+		return -EOVERFLOW;
+	if (args.flags != 0)
+		return -EINVAL;
+	if (args.buf_size < sizeof(struct btrfs_ioctl_get_csums_entry))
+		return -EINVAL;
+
+	buf_limit = min_t(u64, args.buf_size, GET_CSUMS_BUF_MAX);
+	ubuf = (u8 __user *)(argp + offsetof(struct btrfs_ioctl_get_csums_args, buf));
+
+	if (clear_user(ubuf, buf_limit))
+		return -EFAULT;
+
+	cur_offset = args.offset;
+	end_offset = args.offset + args.length;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_wait_ordered_range(inode, cur_offset, args.length);
+	if (ret)
+		return ret;
+
+	ret = down_read_interruptible(&vfs_inode->i_rwsem);
+	if (ret)
+		return ret;
+	/* Presumably repeated to catch writes started before the lock was taken. */
+	ret = btrfs_wait_ordered_range(inode, cur_offset, args.length);
+	if (ret)
+		goto out_unlock;
+
+	/* NODATASUM inode: report the whole range as a single entry. */
+	if (inode->flags & BTRFS_INODE_NODATASUM) {
+		struct btrfs_ioctl_get_csums_entry entry = {
+			.offset = cur_offset,
+			.length = end_offset - cur_offset,
+			.type = BTRFS_GET_CSUMS_NODATASUM,
+		};
+
+		if (copy_to_user(ubuf, &entry, sizeof(entry))) {
+			ret = -EFAULT;
+			goto out_unlock;
+		}
+
+		buf_used = sizeof(entry);
+		cur_offset = end_offset;
+		goto done;
+	}
+
+	prev_extent_end = cur_offset;
+
+	while (cur_offset < end_offset) {
+		struct btrfs_file_extent_item *ei;
+		struct extent_buffer *leaf;
+		struct btrfs_ioctl_get_csums_entry entry = { 0 };
+		u64 extent_end;
+		u64 disk_bytenr = 0;
+		u64 extent_offset = 0;
+		u64 range_start, range_len;
+		u64 entry_csum_size = 0;
+		u64 key_offset;
+		int extent_type;
+		u8 compression;
+		u8 encryption;
+
+		/* Search for the extent at or before cur_offset. */
+		key.objectid = ino;
+		key.type = BTRFS_EXTENT_DATA_KEY;
+		key.offset = cur_offset;
+
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0)
+			goto out_unlock;
+
+		if (ret > 0 && path->slots[0] > 0) {
+			btrfs_item_key_to_cpu(path->nodes[0], &key,
+					      path->slots[0] - 1);
+			if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) {
+				path->slots[0]--;
+				if (btrfs_file_extent_end(path) <= cur_offset)
+					path->slots[0]++;
+			}
+		}
+
+		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto out_unlock;
+			if (ret > 0) {
+				ret = 0;
+				btrfs_release_path(path);
+				break;
+			}
+		}
+
+		leaf = path->nodes[0];
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
+			btrfs_release_path(path);
+			break;
+		}
+
+		extent_end = btrfs_file_extent_end(path);
+		key_offset = key.offset;
+
+		/* Read extent fields before releasing the path. */
+		ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+		extent_type = btrfs_file_extent_type(leaf, ei);
+		compression = btrfs_file_extent_compression(leaf, ei);
+		encryption = btrfs_file_extent_encryption(leaf, ei);
+
+		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+			if (disk_bytenr && compression == BTRFS_COMPRESS_NONE)
+				extent_offset = btrfs_file_extent_offset(leaf, ei);
+		}
+
+		btrfs_release_path(path);
+
+		/* Implicit hole (NO_HOLES feature). */
+		if (prev_extent_end < key_offset) {
+			u64 hole_end = min(key_offset, end_offset);
+			u64 hole_len = hole_end - prev_extent_end;
+
+			if (prev_extent_end >= cur_offset) {
+				entry.offset = prev_extent_end;
+				entry.length = hole_len;
+				entry.type = BTRFS_GET_CSUMS_ZEROED;
+
+				if (buf_used + sizeof(entry) > buf_limit)
+					goto done;
+				if (copy_to_user(ubuf + buf_used, &entry, sizeof(entry))) {
+					ret = -EFAULT;
+					goto out_unlock;
+				}
+				buf_used += sizeof(entry);
+				cur_offset = hole_end;
+			}
+
+			if (key_offset >= end_offset) {
+				cur_offset = end_offset;
+				break;
+			}
+		}
+
+		/* Clamp to our query range. */
+		range_start = max(cur_offset, key_offset);
+		range_len = min(extent_end, end_offset) - range_start;
+
+		entry.offset = range_start;
+		entry.length = range_len;
+
+		if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+			entry.type = BTRFS_GET_CSUMS_INLINE;
+			if (compression != BTRFS_COMPRESS_NONE)
+				entry.type |= BTRFS_GET_CSUMS_COMPRESSED;
+			if (encryption != 0)
+				entry.type |= BTRFS_GET_CSUMS_ENCRYPTED;
+		} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+			entry.type = BTRFS_GET_CSUMS_ZEROED;
+		} else {
+			/* BTRFS_FILE_EXTENT_REG */
+			if (disk_bytenr == 0) {
+				/* Explicit hole. */
+				entry.type = BTRFS_GET_CSUMS_ZEROED;
+			} else if (encryption != 0 || compression != BTRFS_COMPRESS_NONE) {
+				entry.type = 0;
+				if (encryption != 0)
+					entry.type |= BTRFS_GET_CSUMS_ENCRYPTED;
+				if (compression != BTRFS_COMPRESS_NONE)
+					entry.type |= BTRFS_GET_CSUMS_COMPRESSED;
+			} else {
+				entry.type = BTRFS_GET_CSUMS_HAS_CSUMS;
+				entry_csum_size = (range_len >> fs_info->sectorsize_bits) * csum_size;
+			}
+		}
+
+		/* Check if this entry (+ csum data) fits in the buffer. */
+		if (buf_used + sizeof(entry) + entry_csum_size > buf_limit) {
+			if (buf_used == 0) {
+				ret = -EOVERFLOW;
+				goto out_unlock;
+			}
+			goto done;
+		}
+
+		if (copy_to_user(ubuf + buf_used, &entry, sizeof(entry))) {
+			ret = -EFAULT;
+			goto out_unlock;
+		}
+		buf_used += sizeof(entry);
+
+		if (entry.type == BTRFS_GET_CSUMS_HAS_CSUMS) {
+			ret = copy_csums_to_user(fs_info,
+				disk_bytenr + extent_offset + (range_start - key_offset),
+				range_len, ubuf + buf_used);
+			if (ret)
+				goto out_unlock;
+			buf_used += entry_csum_size;
+		}
+
+		cur_offset = range_start + range_len;
+		prev_extent_end = extent_end;
+
+		/* An entry was written above, so return the partial result. */
+		if (fatal_signal_pending(current))
+			goto done;
+
+		cond_resched();
+	}
+
+	/* Handle trailing implicit hole. */
+	if (cur_offset < end_offset) {
+		struct btrfs_ioctl_get_csums_entry entry = {
+			.offset = prev_extent_end,
+			.length = end_offset - prev_extent_end,
+			.type = BTRFS_GET_CSUMS_ZEROED,
+		};
+
+		if (buf_used + sizeof(entry) <= buf_limit) {
+			if (copy_to_user(ubuf + buf_used, &entry, sizeof(entry))) {
+				ret = -EFAULT;
+				goto out_unlock;
+			}
+			buf_used += sizeof(entry);
+			cur_offset = end_offset;
+		}
+	}
+
+done:
+	/* No error so far, do not leak a positive btrfs_search_slot() result. */
+	ret = 0;
+	args.offset = cur_offset;
+	args.length = (cur_offset < end_offset) ? end_offset - cur_offset : 0;
+	args.buf_size = buf_used;
+
+	if (copy_to_user(argp, &args, sizeof(args)))
+		ret = -EFAULT;
+
+out_unlock:
+	up_read(&vfs_inode->i_rwsem);
+	return ret;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
@@ -5273,6 +5610,10 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_set_features(file, argp);
 	case BTRFS_IOC_GET_SUBVOL_INFO:
 		return btrfs_ioctl_get_subvol_info(inode, argp);
+#ifdef CONFIG_64BIT
+	case BTRFS_IOC_GET_SUBVOL_INFO_32:
+		return btrfs_ioctl_get_subvol_info_32(inode, argp);
+#endif
 	case BTRFS_IOC_GET_SUBVOL_ROOTREF:
 		return btrfs_ioctl_get_subvol_rootref(root, argp);
 	case BTRFS_IOC_INO_LOOKUP_USER:
@@ -5297,6 +5638,8 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_subvol_sync(fs_info, argp);
 	case BTRFS_IOC_SHUTDOWN:
 		return btrfs_ioctl_shutdown(fs_info, arg);
+	case BTRFS_IOC_GET_CSUMS:
+		return btrfs_ioctl_get_csums(file, argp);
 	}
 
 	return -ENOTTY;
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 2de18c7b563a..6e4aa22853ab 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -491,6 +491,17 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 			return -EIO;
 		}
 
+		/* The segment must not extend beyond the compressed input. */
+		if (unlikely(cur_in + seg_len > compressed_len)) {
+			struct btrfs_inode *inode = cb->bbio.inode;
+
+			btrfs_err(fs_info,
+			"lzo segment overflows compressed input, root %llu inode %llu offset %llu cur_in %u len %u compressed len %u",
+				  btrfs_root_id(inode->root), btrfs_ino(inode),
+				  cb->start, cur_in, seg_len, compressed_len);
+			return -EUCLEAN;
+		}
+
 		/* Copy the compressed segment payload into workspace */
 		copy_compressed_segment(cb, &fi, &cur_folio_index, workspace->cbuf,
 					seg_len, &cur_in);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index e5a24b3ff95e..b32d4eabe0ab 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -157,7 +157,8 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
 	       ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC)));
 
 	/* Only one type flag can be set. */
-	ASSERT(has_single_bit_set(flags & BTRFS_ORDERED_EXCLUSIVE_FLAGS));
+	ASSERT(has_single_bit_set(flags & BTRFS_ORDERED_EXCLUSIVE_FLAGS),
+	       "flags=0x%lx", flags);
 
 	/* DIRECT cannot be set with COMPRESSED nor ENCODED. */
 	if (test_bit(BTRFS_ORDERED_DIRECT, &flags)) {
@@ -302,7 +303,7 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
 {
 	struct btrfs_ordered_extent *entry;
 
-	ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);
+	ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0, "flags=0x%lx", flags);
 
 	/*
 	 * For regular writes, we just use the members in @file_extent.
@@ -357,6 +358,18 @@ void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered)
 		mapping_set_error(ordered->inode->vfs_inode.i_mapping, -EIO);
 }
 
+/* Flag @ordered as truncated and lower its truncated_len to @truncate_len if smaller. */
+void btrfs_mark_ordered_extent_truncated(struct btrfs_ordered_extent *ordered,
+					 u64 truncate_len)
+{
+	ASSERT(truncate_len <= ordered->num_bytes, "truncate_len=%llu num_bytes=%llu",
+	       truncate_len, ordered->num_bytes);
+	spin_lock(&ordered->inode->ordered_tree_lock);
+	set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
+	ordered->truncated_len = min(ordered->truncated_len, truncate_len);
+	spin_unlock(&ordered->inode->ordered_tree_lock);
+}
+
 static void finish_ordered_fn(struct btrfs_work *work)
 {
 	struct btrfs_ordered_extent *ordered_extent;
@@ -1238,7 +1251,7 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
 
 	trace_btrfs_ordered_extent_split(inode, ordered);
 
-	ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED)));
+	ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED)), "flags=0x%lx", flags);
 
 	/*
 	 * The entire bio must be covered by the ordered extent, but we can't
@@ -1260,7 +1273,7 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
 	}
 	/* We cannot split partially completed ordered extents. */
 	if (ordered->bytes_left) {
-		ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS));
+		ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS), "flags=0x%lx", flags);
 		if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes))
 			return ERR_PTR(-EINVAL);
 	}
@@ -1307,7 +1320,8 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
 	ordered->ram_bytes -= len;
 
 	if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) {
-		ASSERT(ordered->bytes_left == 0);
+		ASSERT(ordered->bytes_left == 0, "ordered->bytes_left=%llu",
+		       ordered->bytes_left);
 		new->bytes_left = 0;
 	} else {
 		ordered->bytes_left -= len;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 03e12380a2fd..8d5d5ba1e02f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -226,6 +226,8 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
 struct btrfs_ordered_extent *btrfs_split_ordered_extent(
 			struct btrfs_ordered_extent *ordered, u64 len);
 void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered);
+void btrfs_mark_ordered_extent_truncated(struct btrfs_ordered_extent *ordered,
+					 u64 truncate_len);
 int __init ordered_data_init(void);
 void __cold ordered_data_exit(void);
 
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 6838faceb6d5..502fb4a55cb2 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -373,10 +373,9 @@ static bool squota_check_parent_usage(struct btrfs_fs_info *fs_info, struct btrf
 		    parent->excl_cmpr != excl_cmpr_sum || parent->rfer_cmpr != rfer_cmpr_sum);
 
 	WARN(mismatch,
-	     "parent squota qgroup %hu/%llu has mismatched usage from its %d members. "
+	     "parent squota qgroup " BTRFS_QGROUP_FMT " has mismatched usage from its %d members. "
 	     "%llu %llu %llu %llu vs %llu %llu %llu %llu\n",
-	     btrfs_qgroup_level(parent->qgroupid),
-	     btrfs_qgroup_subvolid(parent->qgroupid), nr_members, parent->excl,
+	     BTRFS_QGROUP_FMT_VALUE(parent), nr_members, parent->excl,
 	     parent->rfer, parent->excl_cmpr, parent->rfer_cmpr, excl_sum,
 	     rfer_sum, excl_cmpr_sum, rfer_cmpr_sum);
 	return mismatch;
@@ -652,9 +651,8 @@ bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info)
 			if (qgroup->rsv.values[i]) {
 				ret = true;
 				btrfs_warn(fs_info,
-		"qgroup %hu/%llu has unreleased space, type %d rsv %llu",
-				   btrfs_qgroup_level(qgroup->qgroupid),
-				   btrfs_qgroup_subvolid(qgroup->qgroupid),
+		"qgroup " BTRFS_QGROUP_FMT " has unreleased space, type %d rsv %llu",
+				   BTRFS_QGROUP_FMT_VALUE(qgroup),
 				   i, qgroup->rsv.values[i]);
 			}
 		}
@@ -1858,14 +1856,13 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
 	 * Thus its reserved space should all be zero, no matter if qgroup
 	 * is consistent or the mode.
 	 */
-	if (qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
-	    qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
-	    qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]) {
+	if (unlikely(qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
+		     qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
+		     qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS])) {
 		DEBUG_WARN();
 		btrfs_warn_rl(fs_info,
-"to be deleted qgroup %u/%llu has non-zero numbers, data %llu meta prealloc %llu meta pertrans %llu",
-			      btrfs_qgroup_level(qgroup->qgroupid),
-			      btrfs_qgroup_subvolid(qgroup->qgroupid),
+"to be deleted qgroup " BTRFS_QGROUP_FMT " has non-zero numbers, data %llu meta prealloc %llu meta pertrans %llu",
+			      BTRFS_QGROUP_FMT_VALUE(qgroup),
 			      qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA],
 			      qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC],
 			      qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]);
@@ -1879,13 +1876,12 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
 	 */
 	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL &&
 	    !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) {
-		if (qgroup->rfer || qgroup->excl ||
-		    qgroup->rfer_cmpr || qgroup->excl_cmpr) {
+		if (unlikely(qgroup->rfer || qgroup->excl ||
+			     qgroup->rfer_cmpr || qgroup->excl_cmpr)) {
 			DEBUG_WARN();
 			qgroup_mark_inconsistent(fs_info,
-				"to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
-				btrfs_qgroup_level(qgroup->qgroupid),
-				btrfs_qgroup_subvolid(qgroup->qgroupid),
+"to be deleted qgroup " BTRFS_QGROUP_FMT " has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
+				BTRFS_QGROUP_FMT_VALUE(qgroup),
 				qgroup->rfer, qgroup->rfer_cmpr,
 				qgroup->excl, qgroup->excl_cmpr);
 		}
@@ -4822,9 +4818,9 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root,
 
 		entry = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
 
-		if (entry->subvol_generation != block->subvol_generation ||
-		    entry->reloc_bytenr != block->reloc_bytenr ||
-		    entry->reloc_generation != block->reloc_generation) {
+		if (unlikely(entry->subvol_generation != block->subvol_generation ||
+			     entry->reloc_bytenr != block->reloc_bytenr ||
+			     entry->reloc_generation != block->reloc_generation)) {
 			/*
 			 * Duplicated but mismatch entry found.  Shouldn't happen.
 			 * Marking qgroup inconsistent should be enough for end
@@ -4971,9 +4967,8 @@ int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
 		ASSERT(qg->excl == qg->rfer);
 		if (WARN_ON_ONCE(sign < 0 && qg->excl < num_bytes)) {
 			btrfs_warn(fs_info,
-				   "squota underflow qg %hu/%llu excl %llu num_bytes %llu",
-				   btrfs_qgroup_level(qg->qgroupid),
-				   btrfs_qgroup_subvolid(qg->qgroupid),
+				   "squota underflow qg " BTRFS_QGROUP_FMT " excl %llu num_bytes %llu",
+				   BTRFS_QGROUP_FMT_VALUE(qg),
 				   qg->excl, num_bytes);
 			qg->excl = 0;
 			qg->rfer = 0;
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 4b0186c83ad1..454a95bf542a 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -272,7 +272,9 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
 								 &key,
 								 key.offset - length,
 								 length);
-			ASSERT(key.offset - diff_end == length);
+			ASSERT(key.offset - diff_end == length,
+			       "key.offset=%llu diff_end=%llu length=%llu",
+			       key.offset, diff_end, length);
 			break;
 		}
 
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 08ee8f316d96..f7f7db40994c 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -466,7 +466,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 	int bucket = rbio_bucket(rbio);
 	struct btrfs_stripe_hash_table *table;
 	struct btrfs_stripe_hash *h;
-	int freeit = 0;
+	bool freeit = false;
 
 	/*
 	 * check the bit again under the hash table lock.
@@ -491,7 +491,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
 		list_del_init(&rbio->stripe_cache);
 		table->cache_size -= 1;
-		freeit = 1;
+		freeit = true;
 
 		/* if the bio list isn't empty, this rbio is
 		 * still involved in an IO.  We take it out
@@ -855,7 +855,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 {
 	int bucket;
 	struct btrfs_stripe_hash *h;
-	int keep_cache = 0;
+	bool keep_cache = false;
 
 	bucket = rbio_bucket(rbio);
 	h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
@@ -874,7 +874,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 		 */
 		if (list_empty(&rbio->plug_list) &&
 		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
-			keep_cache = 1;
+			keep_cache = true;
 			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
 			BUG_ON(!bio_list_empty(&rbio->bio_list));
 			goto done;
@@ -1123,7 +1123,7 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
 {
 	int ret;
 
-	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, false);
+	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, GFP_NOFS);
 	if (ret < 0)
 		return ret;
 	/* Mapping all sectors */
@@ -1138,7 +1138,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
 	int ret;
 
 	ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
-				     rbio->stripe_pages + data_pages, false);
+				     rbio->stripe_pages + data_pages, GFP_NOFS);
 	if (ret < 0)
 		return ret;
 
@@ -1732,7 +1732,7 @@ static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
 	const int data_pages = rbio->nr_data * rbio->stripe_npages;
 	int ret;
 
-	ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, false);
+	ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, GFP_NOFS);
 	if (ret < 0)
 		return ret;
 
@@ -2695,7 +2695,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
 	phys_addr_t p_paddr = INVALID_PADDR;
 	phys_addr_t q_paddr = INVALID_PADDR;
 	struct bio_list bio_list;
-	int is_replace = 0;
+	bool is_replace = false;
 	int ret;
 
 	bio_list_init(&bio_list);
@@ -2712,7 +2712,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
 	 * need to duplicate the final write to replace target.
 	 */
 	if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) {
-		is_replace = 1;
+		is_replace = true;
 		bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
 	}
 
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 49865a463780..9a49d2ecb949 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -69,7 +69,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
 	struct address_space *mapping = inode->vfs_inode.i_mapping;
 	int ret;
 
-	ASSERT(IS_ALIGNED(file_offset, block_size));
+	ASSERT(IS_ALIGNED(file_offset, block_size), "file_offset=%llu block_size=%u",
+	       file_offset, block_size);
 
 	/*
 	 * We have flushed and locked the ranges of the source and destination
@@ -94,9 +95,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
 	if (ret < 0)
 		goto out_unlock;
 
-	btrfs_clear_extent_bit(&inode->io_tree, file_offset, range_end,
-			       EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, NULL);
-	ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
+	ret = btrfs_reset_extent_delalloc(inode, file_offset, range_end, 0, NULL);
 	if (ret)
 		goto out_unlock;
 
@@ -141,7 +140,6 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
 		folio_zero_range(folio, datal, block_size - datal);
 
 	btrfs_folio_set_uptodate(fs_info, folio, file_offset, block_size);
-	btrfs_folio_clear_checked(fs_info, folio, file_offset, block_size);
 	btrfs_folio_set_dirty(fs_info, folio, file_offset, block_size);
 out_unlock:
 	if (!IS_ERR(folio)) {
@@ -181,10 +179,12 @@ static int clone_copy_inline_extent(struct btrfs_inode *inode,
 	struct btrfs_drop_extents_args drop_args = { 0 };
 	int ret;
 	struct btrfs_key key;
+	bool copied_inline_to_page = false;
 
 	if (new_key->offset > 0) {
 		ret = copy_inline_to_page(inode, new_key->offset,
 					  inline_data, size, datal, comp_type);
+		copied_inline_to_page = (ret == 0);
 		goto out;
 	}
 
@@ -290,6 +290,60 @@ copy_inline_extent:
 		btrfs_abort_transaction(trans, ret);
 out:
 	if (!ret && !trans) {
+		if (copied_inline_to_page &&
+		    new_key->offset + datal > i_size_read(&inode->vfs_inode)) {
+			/*
+			 * If we copied the inline extent data to a page/folio
+			 * beyond the i_size of the destination inode, then we
+			 * need to increase the i_size before we start a
+			 * transaction to update the inode item. This is to
+			 * prevent a deadlock when the flushoncommit mount
+			 * option is used, which happens like this:
+			 *
+			 * 1) Task A clones an inline extent from inode X to an
+			 *    offset of inode Y that is beyond Y's current
+			 *    i_size. This means we copied the inline extent's
+			 *    data to a folio of inode Y that is beyond its EOF,
+			 *    using the call above to copy_inline_to_page();
+			 *
+			 * 2) Task B starts a transaction commit and calls
+			 *    btrfs_start_delalloc_flush() to flush delalloc;
+			 *
+			 * 3) The delalloc flushing sees the new dirty folio of
+			 *    inode Y and when it attempts to flush it, it ends
+			 *    up at extent_writepage() and sees that the offset
+			 *    of the folio is beyond the i_size of inode Y, so
+			 *    it attempts to invalidate the folio by calling
+			 *    folio_invalidate(), which ends up at btrfs' folio
+			 *    invalidate callback - btrfs_invalidate_folio().
+			 *    There it tries to lock the folio's range in inode
+			 *    Y's extent io tree, but it blocks since it's
+			 *    currently locked by task A - during reflink we
+			 *    lock the inodes and the source and destination
+			 *    ranges after flushing all delalloc and waiting for
+			 *    ordered extent completion - after that we don't
+			 *    expect to have dirty folios in the ranges, the
+			 *    exception is if we have to copy an inline extent's
+			 *    data (because the destination offset is not zero);
+			 *
+			 * 4) Task A then does the 'goto out' below and attempts
+			 *    to start a transaction to update the inode item,
+			 *    and then it's blocked since the current
+			 *    transaction is in the TRANS_STATE_COMMIT_START
+			 *    state. Therefore task A has to wait for the
+			 *    current transaction to become unblocked (its
+			 *    state >= TRANS_STATE_UNBLOCKED).
+			 *
+			 * This leads to a deadlock - the task committing the
+			 * transaction waiting for the delalloc flushing which
+			 * is blocked during folio invalidation on the inode's
+			 * extent lock and the reflink task waiting for the
+			 * current transaction to be unblocked so that it can
+			 * start a new one to update the inode item (while
+			 * holding the extent lock).
+			 */
+			i_size_write(&inode->vfs_inode, new_key->offset + datal);
+		}
 		/*
 		 * No transaction here means we copied the inline extent into a
 		 * page of the destination inode.
@@ -322,50 +376,7 @@ copy_to_page:
 
 	ret = copy_inline_to_page(inode, new_key->offset,
 				  inline_data, size, datal, comp_type);
-
-	/*
-	 * If we copied the inline extent data to a page/folio beyond the i_size
-	 * of the destination inode, then we need to increase the i_size before
-	 * we start a transaction to update the inode item. This is to prevent a
-	 * deadlock when the flushoncommit mount option is used, which happens
-	 * like this:
-	 *
-	 * 1) Task A clones an inline extent from inode X to an offset of inode
-	 *    Y that is beyond Y's current i_size. This means we copied the
-	 *    inline extent's data to a folio of inode Y that is beyond its EOF,
-	 *    using the call above to copy_inline_to_page();
-	 *
-	 * 2) Task B starts a transaction commit and calls
-	 *    btrfs_start_delalloc_flush() to flush delalloc;
-	 *
-	 * 3) The delalloc flushing sees the new dirty folio of inode Y and when
-	 *    it attempts to flush it, it ends up at extent_writepage() and sees
-	 *    that the offset of the folio is beyond the i_size of inode Y, so
-	 *    it attempts to invalidate the folio by calling folio_invalidate(),
-	 *    which ends up at btrfs' folio invalidate callback -
-	 *    btrfs_invalidate_folio(). There it tries to lock the folio's range
-	 *    in inode Y's extent io tree, but it blocks since it's currently
-	 *    locked by task A - during reflink we lock the inodes and the
-	 *    source and destination ranges after flushing all delalloc and
-	 *    waiting for ordered extent completion - after that we don't expect
-	 *    to have dirty folios in the ranges, the exception is if we have to
-	 *    copy an inline extent's data (because the destination offset is
-	 *    not zero);
-	 *
-	 * 4) Task A then does the 'goto out' below and attempts to start a
-	 *    transaction to update the inode item, and then it's blocked since
-	 *    the current transaction is in the TRANS_STATE_COMMIT_START state.
-	 *    Therefore task A has to wait for the current transaction to become
-	 *    unblocked (its state >= TRANS_STATE_UNBLOCKED).
-	 *
-	 * This leads to a deadlock - the task committing the transaction
-	 * waiting for the delalloc flushing which is blocked during folio
-	 * invalidation on the inode's extent lock and the reflink task waiting
-	 * for the current transaction to be unblocked so that it can start a
-	 * a new one to update the inode item (while holding the extent lock).
-	 */
-	if (ret == 0 && new_key->offset + datal > i_size_read(&inode->vfs_inode))
-		i_size_write(&inode->vfs_inode, new_key->offset + datal);
+	copied_inline_to_page = (ret == 0);
 
 	goto out;
 }
@@ -459,7 +470,7 @@ process_slot:
 		    key.objectid != btrfs_ino(BTRFS_I(src)))
 			break;
 
-		ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
+		ASSERT(key.type == BTRFS_EXTENT_DATA_KEY, "key.type=%u", key.type);
 
 		extent = btrfs_item_ptr(leaf, slot,
 					struct btrfs_file_extent_item);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 3ebaf5880125..fb85bc8b345c 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -178,8 +178,129 @@ struct reloc_control {
 	bool create_reloc_tree;
 	bool merge_reloc_tree;
 	bool found_file_extent;
+
+	/* Reference count, see get_reloc_control() / put_reloc_control(). */
+	refcount_t refs;
 };
 
+/*
+ * Grab a reference on the filesystem's current reloc control.
+ *
+ * Returns fs_info->reloc_ctl with rc->refs incremented, or NULL if no reloc
+ * control is set.  A non-NULL result must be released with
+ * put_reloc_control().
+ */
+static struct reloc_control *get_reloc_control(struct btrfs_fs_info *fs_info)
+{
+	struct reloc_control *rc = NULL;
+
+	/*
+	 * Unlocked peek first so the "nothing set" case does not contend on
+	 * fs_info->reloc_ctl_lock.  The pointer is read again under the lock
+	 * before a reference is taken.
+	 */
+	if (data_race(fs_info->reloc_ctl)) {
+		spin_lock(&fs_info->reloc_ctl_lock);
+		rc = fs_info->reloc_ctl;
+		if (rc)
+			refcount_inc(&rc->refs);
+		spin_unlock(&fs_info->reloc_ctl_lock);
+	}
+
+	return rc;
+}
+
+static void __del_reloc_root(struct btrfs_root *root);
+
+/* Call __del_reloc_root() on every reloc root linked on @list. */
+static noinline_for_stack void free_reloc_roots(struct list_head *list)
+{
+	struct btrfs_root *cur, *next;
+
+	list_for_each_entry_safe(cur, next, list, root_list)
+		__del_reloc_root(cur);
+}
+
+/*
+ * Drop a reference on @rc, tearing it down when the last one goes away.
+ *
+ * On the final put the reloc roots still on rc->reloc_roots are deleted, the
+ * mapping nodes left in rc->reloc_root_tree are freed, the block group
+ * reference (if any) is put and @rc itself is freed.
+ */
+static void put_reloc_control(struct reloc_control *rc)
+{
+	struct mapping_node *node, *next;
+
+	if (!refcount_dec_and_test(&rc->refs))
+		return;
+
+	/* A reloc control being freed must no longer be set in fs_info. */
+	if (rc->extent_root)
+		ASSERT(rc->extent_root->fs_info->reloc_ctl != rc);
+
+	free_reloc_roots(&rc->reloc_roots);
+	rbtree_postorder_for_each_entry_safe(node, next,
+					     &rc->reloc_root_tree.rb_root,
+					     rb_node)
+		kfree(node);
+
+	if (rc->block_group)
+		btrfs_put_block_group(rc->block_group);
+
+	kfree(rc);
+}
+
+/*
+ * Helper to delete the 'address of tree root -> reloc tree' mapping.
+ *
+ * When a reloc control is set and @root has a node, the mapping node found
+ * for root->commit_root->start is erased from that control's reloc_root_tree.
+ * Independently of that, @root is taken off the root_list it is linked on (if
+ * any) and the root reference held for that list membership is dropped.
+ */
+static void __del_reloc_root(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct mapping_node AUTO_KFREE(node);
+	struct reloc_control *rc;
+	bool was_listed;
+
+	rc = get_reloc_control(fs_info);
+	if (rc && root->node) {
+		struct rb_node *found;
+
+		spin_lock(&rc->reloc_root_tree.lock);
+		found = rb_simple_search(&rc->reloc_root_tree.rb_root,
+					 root->commit_root->start);
+		if (found) {
+			node = rb_entry(found, struct mapping_node, rb_node);
+			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
+			RB_CLEAR_NODE(&node->rb_node);
+		}
+		spin_unlock(&rc->reloc_root_tree.lock);
+		ASSERT(!node || (struct btrfs_root *)node->data == root);
+	}
+
+	/*
+	 * The reloc root reference is only dropped here if the root is on a
+	 * list.  Many callers splice rc->reloc_roots, process the reloc roots
+	 * and then add them back onto rc->reloc_roots.  If __del_reloc_root()
+	 * runs while the root is off the list, the reference belongs to
+	 * whoever is manipulating the list and must not be dropped here.
+	 */
+	spin_lock(&fs_info->trans_lock);
+	was_listed = !list_empty(&root->root_list);
+	if (was_listed)
+		list_del_init(&root->root_list);
+	spin_unlock(&fs_info->trans_lock);
+
+	if (was_listed)
+		btrfs_put_root(root);
+	if (rc)
+		put_reloc_control(rc);
+}
+
 static void mark_block_processed(struct reloc_control *rc,
 				 struct btrfs_backref_node *node)
 {
@@ -407,7 +500,7 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
 			struct reloc_control *rc, struct btrfs_key *node_key,
 			int level, u64 bytenr)
 {
-	struct btrfs_backref_iter *iter;
+	struct btrfs_backref_iter iter;
 	struct btrfs_backref_cache *cache = &rc->backref_cache;
 	/* For searching parent of TREE_BLOCK_REF */
 	struct btrfs_path *path;
@@ -416,9 +509,9 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
 	struct btrfs_backref_edge *edge;
 	int ret;
 
-	iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info);
-	if (!iter)
-		return ERR_PTR(-ENOMEM);
+	ret = btrfs_backref_iter_init(&iter);
+	if (ret < 0)
+		return ERR_PTR(ret);
 	path = btrfs_alloc_path();
 	if (!path) {
 		ret = -ENOMEM;
@@ -435,7 +528,7 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
 
 	/* Breadth-first search to build backref cache */
 	do {
-		ret = btrfs_backref_add_tree_node(trans, cache, path, iter,
+		ret = btrfs_backref_add_tree_node(trans, cache, path, &iter,
 						  node_key, cur);
 		if (ret < 0)
 			goto out;
@@ -460,8 +553,7 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
 	if (handle_useless_nodes(rc, node))
 		node = NULL;
 out:
-	btrfs_free_path(iter->path);
-	kfree(iter);
+	btrfs_free_path(iter.path);
 	btrfs_free_path(path);
 	if (ret) {
 		btrfs_backref_error_cleanup(cache, node);
@@ -476,12 +568,11 @@ out:
 /*
  * helper to add 'address of tree root -> reloc tree' mapping
  */
-static int __add_reloc_root(struct btrfs_root *root)
+static int __add_reloc_root(struct btrfs_root *root, struct reloc_control *rc)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct rb_node *rb_node;
 	struct mapping_node *node;
-	struct reloc_control *rc = fs_info->reloc_ctl;
 
 	node = kmalloc_obj(*node, GFP_NOFS);
 	if (!node)
@@ -505,49 +596,6 @@ static int __add_reloc_root(struct btrfs_root *root)
 }
 
 /*
- * helper to delete the 'address of tree root -> reloc tree'
- * mapping
- */
-static void __del_reloc_root(struct btrfs_root *root)
-{
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct rb_node *rb_node;
-	struct mapping_node AUTO_KFREE(node);
-	struct reloc_control *rc = fs_info->reloc_ctl;
-	bool put_ref = false;
-
-	if (rc && root->node) {
-		spin_lock(&rc->reloc_root_tree.lock);
-		rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
-					   root->commit_root->start);
-		if (rb_node) {
-			node = rb_entry(rb_node, struct mapping_node, rb_node);
-			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
-			RB_CLEAR_NODE(&node->rb_node);
-		}
-		spin_unlock(&rc->reloc_root_tree.lock);
-		ASSERT(!node || (struct btrfs_root *)node->data == root);
-	}
-
-	/*
-	 * We only put the reloc root here if it's on the list.  There's a lot
-	 * of places where the pattern is to splice the rc->reloc_roots, process
-	 * the reloc roots, and then add the reloc root back onto
-	 * rc->reloc_roots.  If we call __del_reloc_root while it's off of the
-	 * list we don't want the reference being dropped, because the guy
-	 * messing with the list is in charge of the reference.
-	 */
-	spin_lock(&fs_info->trans_lock);
-	if (!list_empty(&root->root_list)) {
-		put_ref = true;
-		list_del_init(&root->root_list);
-	}
-	spin_unlock(&fs_info->trans_lock);
-	if (put_ref)
-		btrfs_put_root(root);
-}
-
-/*
  * helper to update the 'address of tree root -> reloc tree'
  * mapping
  */
@@ -590,7 +638,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
 	struct btrfs_key root_key;
 	int ret = 0;
 
-	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
+	root_item = kmalloc_obj(*root_item, GFP_NOFS);
 	if (!root_item)
 		return ERR_PTR(-ENOMEM);
 
@@ -700,11 +748,12 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_root *reloc_root;
-	struct reloc_control *rc = fs_info->reloc_ctl;
+	struct reloc_control *rc;
 	struct btrfs_block_rsv *rsv;
-	int clear_rsv = 0;
-	int ret;
+	bool clear_rsv = false;
+	int ret = 0;
 
+	rc = get_reloc_control(fs_info);
 	if (!rc)
 		return 0;
 
@@ -713,7 +762,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
 	 * create/update the dead reloc tree
 	 */
 	if (reloc_root_is_dead(root))
-		return 0;
+		goto out;
 
 	/*
 	 * This is subtle but important.  We do not do
@@ -724,9 +773,8 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
 	 * in.
 	 */
 	if (root->reloc_root) {
-		reloc_root = root->reloc_root;
-		btrfs_set_root_last_trans(reloc_root, trans->transid);
-		return 0;
+		btrfs_set_root_last_trans(root->reloc_root, trans->transid);
+		goto out;
 	}
 
 	/*
@@ -734,28 +782,33 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
 	 * reloc trees never need their own reloc tree.
 	 */
 	if (!rc->create_reloc_tree || btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
-		return 0;
+		goto out;
 
 	if (!trans->reloc_reserved) {
 		rsv = trans->block_rsv;
 		trans->block_rsv = rc->block_rsv;
-		clear_rsv = 1;
+		clear_rsv = true;
 	}
 	reloc_root = create_reloc_root(trans, root, btrfs_root_id(root));
 	if (clear_rsv)
 		trans->block_rsv = rsv;
-	if (IS_ERR(reloc_root))
-		return PTR_ERR(reloc_root);
+	if (IS_ERR(reloc_root)) {
+		ret = PTR_ERR(reloc_root);
+		goto out;
+	}
 
-	ret = __add_reloc_root(reloc_root);
+	ret = __add_reloc_root(reloc_root, rc);
 	ASSERT(ret != -EEXIST);
 	if (ret) {
 		/* Pairs with create_reloc_root */
 		btrfs_put_root(reloc_root);
-		return ret;
+		goto out;
 	}
 	root->reloc_root = btrfs_grab_root(reloc_root);
-	return 0;
+out:
+	put_reloc_control(rc);
+
+	return ret;
 }
 
 /*
@@ -767,6 +820,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_root *reloc_root;
 	struct btrfs_root_item *root_item;
+	struct reloc_control *rc;
 	int ret;
 
 	if (!have_reloc_root(root))
@@ -782,9 +836,9 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
 	 */
 	btrfs_grab_root(reloc_root);
 
+	rc = get_reloc_control(fs_info);
 	/* root->reloc_root will stay until current relocation finished */
-	if (fs_info->reloc_ctl && fs_info->reloc_ctl->merge_reloc_tree &&
-	    btrfs_root_refs(root_item) == 0) {
+	if (rc && rc->merge_reloc_tree && btrfs_root_refs(root_item) == 0) {
 		set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
 		/*
 		 * Mark the tree as dead before we change reloc_root so
@@ -804,6 +858,9 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
 	ret = btrfs_update_root(trans, fs_info->tree_root,
 				&reloc_root->root_key, root_item);
 	btrfs_put_root(reloc_root);
+	if (rc)
+		put_reloc_control(rc);
+
 	return ret;
 }
 
@@ -814,6 +871,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
 			    u64 bytenr, u64 num_bytes)
 {
 	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	BTRFS_PATH_AUTO_FREE(path);
 	struct btrfs_file_extent_item *fi;
 	struct extent_buffer *leaf;
@@ -835,10 +893,23 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
 	fi = btrfs_item_ptr(leaf, path->slots[0],
 			    struct btrfs_file_extent_item);
 
-	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
-	       btrfs_file_extent_compression(leaf, fi) ||
-	       btrfs_file_extent_encryption(leaf, fi) ||
-	       btrfs_file_extent_other_encoding(leaf, fi));
+	/*
+	 * The cluster-boundary key searched above is always written by
+	 * relocation with offset 0: either by insert_prealloc_file_extent()
+	 * (memsets the stack item to 0) or by the front portion of a partial
+	 * writeback (offset=0 by construction). A non-zero value here means
+	 * the on-disk leaf does not match what relocation wrote, i.e.
+	 * corruption. The other encoding fields are caught earlier by
+	 * tree-checker's check_extent_data_item().
+	 */
+	if (unlikely(btrfs_file_extent_offset(leaf, fi))) {
+		btrfs_print_leaf(leaf);
+		btrfs_err(fs_info,
+"unexpected non-zero offset in file extent item for data reloc inode %llu key offset %llu offset %llu",
+			  btrfs_ino(BTRFS_I(reloc_inode)), bytenr,
+			  btrfs_file_extent_offset(leaf, fi));
+		return -EUCLEAN;
+	}
 
 	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi))
 		return -EINVAL;
@@ -869,7 +940,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
 	u32 nritems;
 	u32 i;
 	int ret = 0;
-	int first = 1;
+	bool first = true;
 
 	if (rc->stage != UPDATE_DATA_PTRS)
 		return 0;
@@ -907,7 +978,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
 		if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) {
 			if (first) {
 				inode = btrfs_find_first_inode(root, key.objectid);
-				first = 0;
+				first = false;
 			} else if (inode && btrfs_ino(inode) < key.objectid) {
 				btrfs_add_delayed_iput(inode);
 				inode = btrfs_find_first_inode(root, key.objectid);
@@ -1021,7 +1092,7 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
 	u64 new_ptr_gen;
 	u64 last_snapshot;
 	u32 blocksize;
-	int cow = 0;
+	bool cow = false;
 	int level;
 	int ret;
 	int slot;
@@ -1127,7 +1198,7 @@ again:
 		if (!cow) {
 			btrfs_tree_unlock(parent);
 			free_extent_buffer(parent);
-			cow = 1;
+			cow = true;
 			goto again;
 		}
 
@@ -1515,7 +1586,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 	int reserve_level;
 	int level;
 	int max_level;
-	int replaced = 0;
+	bool replaced = false;
 	int ret = 0;
 	u32 min_reserved;
 
@@ -1561,7 +1632,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 	 * and * 2 since we have two trees to COW.
 	 */
 	reserve_level = max_t(int, 1, btrfs_root_level(root_item));
-	min_reserved = fs_info->nodesize * reserve_level * 2;
+	min_reserved = (reserve_level << fs_info->nodesize_bits) * 2;
 	memset(&next_key, 0, sizeof(next_key));
 
 	while (1) {
@@ -1590,7 +1661,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 		btrfs_set_root_last_trans(reloc_root, trans->transid);
 		trans->block_rsv = rc->block_rsv;
 
-		replaced = 0;
+		replaced = false;
 		max_level = level;
 
 		ret = walk_down_reloc_tree(reloc_root, path, &level);
@@ -1612,7 +1683,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 			level = ret;
 			btrfs_node_key_to_cpu(path->nodes[level], &key,
 					      path->slots[level]);
-			replaced = 1;
+			replaced = true;
 		}
 
 		ret = walk_up_reloc_tree(reloc_root, path, &level);
@@ -1795,22 +1866,13 @@ again:
 }
 
 static noinline_for_stack
-void free_reloc_roots(struct list_head *list)
-{
-	struct btrfs_root *reloc_root, *tmp;
-
-	list_for_each_entry_safe(reloc_root, tmp, list, root_list)
-		__del_reloc_root(reloc_root);
-}
-
-static noinline_for_stack
 void merge_reloc_roots(struct reloc_control *rc)
 {
 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
 	struct btrfs_root *root;
 	struct btrfs_root *reloc_root;
 	LIST_HEAD(reloc_roots);
-	int found = 0;
+	bool found = false;
 	int ret = 0;
 again:
 	root = rc->extent_root;
@@ -1826,7 +1888,7 @@ again:
 	mutex_unlock(&fs_info->reloc_mutex);
 
 	while (!list_empty(&reloc_roots)) {
-		found = 1;
+		found = true;
 		reloc_root = list_first_entry(&reloc_roots, struct btrfs_root, root_list);
 
 		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
@@ -1879,7 +1941,7 @@ again:
 	}
 
 	if (found) {
-		found = 0;
+		found = false;
 		goto again;
 	}
 out:
@@ -1907,7 +1969,7 @@ out:
 	 * do the reloc_dirty_list afterwards.  Meanwhile the root->reloc_root
 	 * will be cleaned up on unmount.
 	 *
-	 * The remaining nodes will be cleaned up by free_reloc_control.
+	 * The remaining nodes will be cleaned up by put_reloc_control().
 	 */
 }
 
@@ -2559,7 +2621,7 @@ static int relocate_cowonly_block(struct btrfs_trans_handle *trans,
 
 	nr_levels = max(btrfs_header_level(root->node) - block->level, 0) + 1;
 
-	num_bytes = fs_info->nodesize * nr_levels;
+	num_bytes = (nr_levels << fs_info->nodesize_bits);
 	ret = refill_metadata_space(trans, rc, num_bytes);
 	if (ret) {
 		btrfs_put_root(root);
@@ -2944,7 +3006,7 @@ static int relocate_file_extent_cluster(struct reloc_control *rc)
 	if (!cluster->nr)
 		return 0;
 
-	ra = kzalloc(sizeof(*ra), GFP_NOFS);
+	ra = kzalloc_obj(*ra, GFP_NOFS);
 	if (!ra)
 		return -ENOMEM;
 
@@ -3187,13 +3249,12 @@ again:
 			goto again;
 		}
 	}
-	if (ret) {
+	if (WARN_ON(ret)) {
 		ASSERT(ret == 1);
 		btrfs_print_leaf(path->nodes[0]);
 		btrfs_err(fs_info,
 	     "tree block extent item (%llu) is not found in extent tree",
 		     bytenr);
-		WARN_ON(1);
 		return -EINVAL;
 	}
 
@@ -3421,7 +3482,9 @@ static void set_reloc_control(struct reloc_control *rc)
 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
 
 	mutex_lock(&fs_info->reloc_mutex);
+	spin_lock(&fs_info->reloc_ctl_lock);
 	fs_info->reloc_ctl = rc;
+	spin_unlock(&fs_info->reloc_ctl_lock);
 	mutex_unlock(&fs_info->reloc_mutex);
 }
 
@@ -3430,7 +3493,9 @@ static void unset_reloc_control(struct reloc_control *rc)
 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
 
 	mutex_lock(&fs_info->reloc_mutex);
+	spin_lock(&fs_info->reloc_ctl_lock);
 	fs_info->reloc_ctl = NULL;
+	spin_unlock(&fs_info->reloc_ctl_lock);
 	mutex_unlock(&fs_info->reloc_mutex);
 }
 
@@ -3815,19 +3880,9 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
 	rc->reloc_root_tree.rb_root = RB_ROOT;
 	spin_lock_init(&rc->reloc_root_tree.lock);
 	btrfs_extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS);
-	return rc;
-}
+	refcount_set(&rc->refs, 1);
 
-static void free_reloc_control(struct reloc_control *rc)
-{
-	struct mapping_node *node, *tmp;
-
-	free_reloc_roots(&rc->reloc_roots);
-	rbtree_postorder_for_each_entry_safe(node, tmp,
-			&rc->reloc_root_tree.rb_root, rb_node)
-		kfree(node);
-
-	kfree(rc);
+	return rc;
 }
 
 /*
@@ -3863,7 +3918,7 @@ static int add_remap_tree_entries(struct btrfs_trans_handle *trans, struct btrfs
 
 	max_items = BTRFS_LEAF_DATA_SIZE(trans->fs_info) / sizeof(struct btrfs_item);
 
-	data_sizes = kzalloc(sizeof(u32) * min_t(u32, num_entries, max_items), GFP_NOFS);
+	data_sizes = kzalloc_objs(u32, min_t(u32, num_entries, max_items), GFP_NOFS);
 	if (!data_sizes)
 		return -ENOMEM;
 
@@ -4038,7 +4093,7 @@ static int copy_remapped_data(struct btrfs_fs_info *fs_info, u64 old_addr,
 	if (!pages)
 		return -ENOMEM;
 
-	ret = btrfs_alloc_page_array(nr_pages, pages, 0);
+	ret = btrfs_alloc_page_array(nr_pages, pages, GFP_NOFS);
 	if (ret) {
 		ret = -ENOMEM;
 		goto end;
@@ -4454,7 +4509,7 @@ static int create_remap_tree_entries(struct btrfs_trans_handle *trans,
 
 	btrfs_release_path(path);
 
-	space_runs = kmalloc(sizeof(*space_runs) * extent_count, GFP_NOFS);
+	space_runs = kmalloc_objs(*space_runs, extent_count, GFP_NOFS);
 	if (!space_runs) {
 		mutex_unlock(&bg->free_space_lock);
 		return -ENOMEM;
@@ -4543,7 +4598,7 @@ static int create_remap_tree_entries(struct btrfs_trans_handle *trans,
 	mutex_unlock(&bg->free_space_lock);
 
 	max_entries = extent_count + 2;
-	entries = kmalloc(sizeof(*entries) * max_entries, GFP_NOFS);
+	entries = kmalloc_objs(*entries, max_entries, GFP_NOFS);
 	if (!entries) {
 		ret = -ENOMEM;
 		goto out;
@@ -5367,13 +5422,14 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
 		return -ENOMEM;
 	}
 
-	ret = reloc_chunk_start(fs_info);
-	if (ret < 0)
-		goto out_put_bg;
-
 	rc->extent_root = extent_root;
+	/* Block group ref now owned by rc, put_reloc_control() will drop it. */
 	rc->block_group = bg;
 
+	ret = reloc_chunk_start(fs_info);
+	if (ret < 0)
+		goto out_put_rc;
+
 	ret = btrfs_inc_block_group_ro(rc->block_group, true);
 	if (ret)
 		goto out;
@@ -5441,9 +5497,8 @@ out:
 		iput(rc->data_inode);
 	btrfs_free_path(path);
 	reloc_chunk_end(fs_info);
-out_put_bg:
-	btrfs_put_block_group(bg);
-	free_reloc_control(rc);
+out_put_rc:
+	put_reloc_control(rc);
 	return ret;
 }
 
@@ -5598,7 +5653,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
 			goto out_unset;
 		}
 
-		ret = __add_reloc_root(reloc_root);
+		ret = __add_reloc_root(reloc_root, rc);
 		ASSERT(ret != -EEXIST);
 		if (ret) {
 			list_add_tail(&reloc_root->root_list, &reloc_roots);
@@ -5632,7 +5687,7 @@ out_unset:
 	unset_reloc_control(rc);
 	reloc_chunk_end(fs_info);
 out_end:
-	free_reloc_control(rc);
+	put_reloc_control(rc);
 out:
 	free_reloc_roots(&reloc_roots);
 
@@ -5712,11 +5767,11 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct reloc_control *rc;
 	struct btrfs_backref_node *node;
-	int first_cow = 0;
+	bool first_cow = false;
 	int level;
 	int ret = 0;
 
-	rc = fs_info->reloc_ctl;
+	rc = get_reloc_control(fs_info);
 	if (!rc)
 		return 0;
 
@@ -5725,7 +5780,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
 	level = btrfs_header_level(buf);
 	if (btrfs_header_generation(buf) <=
 	    btrfs_root_last_snapshot(&root->root_item))
-		first_cow = 1;
+		first_cow = true;
 
 	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID && rc->create_reloc_tree) {
 		WARN_ON(!first_cow && level == 0);
@@ -5741,7 +5796,8 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
 			btrfs_err(fs_info,
 "bytenr %llu was found but our backref cache was expecting %llu or %llu",
 				  buf->start, node->bytenr, node->new_bytenr);
-			return -EUCLEAN;
+			ret = -EUCLEAN;
+			goto out;
 		}
 
 		btrfs_backref_drop_node_buffer(node);
@@ -5764,6 +5820,9 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
 
 	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
 		ret = replace_file_extents(trans, rc, root, cow);
+out:
+	put_reloc_control(rc);
+
 	return ret;
 }
 
@@ -5812,13 +5871,16 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
 	struct btrfs_root *root = pending->root;
 	struct btrfs_root *reloc_root;
 	struct btrfs_root *new_root;
-	struct reloc_control *rc = root->fs_info->reloc_ctl;
-	int ret;
+	struct reloc_control *rc;
+	int ret = 0;
 
-	if (!rc || !have_reloc_root(root))
+	rc = get_reloc_control(trans->fs_info);
+	if (!rc)
 		return 0;
 
-	rc = root->fs_info->reloc_ctl;
+	if (!have_reloc_root(root))
+		goto out;
+
 	rc->merging_rsv_size += rc->nodes_relocated;
 
 	if (rc->merge_reloc_tree) {
@@ -5826,23 +5888,28 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
 					      rc->block_rsv,
 					      rc->nodes_relocated, true);
 		if (ret)
-			return ret;
+			goto out;
 	}
 
 	new_root = pending->snap;
 	reloc_root = create_reloc_root(trans, root->reloc_root, btrfs_root_id(new_root));
-	if (IS_ERR(reloc_root))
-		return PTR_ERR(reloc_root);
+	if (IS_ERR(reloc_root)) {
+		ret = PTR_ERR(reloc_root);
+		goto out;
+	}
 
-	ret = __add_reloc_root(reloc_root);
+	ret = __add_reloc_root(reloc_root, rc);
 	ASSERT(ret != -EEXIST);
 	if (ret) {
 		/* Pairs with create_reloc_root */
 		btrfs_put_root(reloc_root);
-		return ret;
+		goto out;
 	}
 	new_root->reloc_root = btrfs_grab_root(reloc_root);
-	return 0;
+out:
+	put_reloc_control(rc);
+
+	return ret;
 }
 
 /*
@@ -5850,14 +5917,15 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
  *
  * Return U64_MAX if no running relocation.
  */
-u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info)
+u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info)
 {
 	u64 logical = U64_MAX;
 
-	lockdep_assert_held(&fs_info->reloc_mutex);
-
+	mutex_lock(&fs_info->reloc_mutex);
 	if (fs_info->reloc_ctl && fs_info->reloc_ctl->block_group)
 		logical = fs_info->reloc_ctl->block_group->start;
+	mutex_unlock(&fs_info->reloc_mutex);
+
 	return logical;
 }
 
diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h
index d647823b5d13..bb7a86e7dbe3 100644
--- a/fs/btrfs/relocation.h
+++ b/fs/btrfs/relocation.h
@@ -41,7 +41,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
 int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info);
 struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr);
 bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root);
-u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info);
+u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info);
 int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical, u64 *length);
 int btrfs_remove_extent_from_remap_tree(struct btrfs_trans_handle *trans,
 					struct btrfs_path *path,
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index d85a09ae1733..90659b287d90 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -27,20 +27,20 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
 				struct btrfs_root_item *item)
 {
 	u32 len;
-	int need_reset = 0;
+	bool need_reset = false;
 
 	len = btrfs_item_size(eb, slot);
 	read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot),
 			   min_t(u32, len, sizeof(*item)));
 	if (len < sizeof(*item))
-		need_reset = 1;
+		need_reset = true;
 	if (!need_reset && btrfs_root_generation(item)
 		!= btrfs_root_generation_v2(item)) {
 		if (btrfs_root_generation_v2(item) != 0) {
 			btrfs_warn(eb->fs_info,
 					"mismatching generation and generation_v2 found in root item. This root was probably mounted with an older kernel. Resetting all new fields.");
 		}
-		need_reset = 1;
+		need_reset = true;
 	}
 	if (need_reset) {
 		/* Clear all members from generation_v2 onwards. */
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 1ac609239cbe..d2f7ac5b6e96 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -369,7 +369,8 @@ static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
 
 	ASSERT(BTRFS_STRIPE_LEN >> min_folio_shift <= SCRUB_STRIPE_MAX_FOLIOS);
 	ret = btrfs_alloc_folio_array(BTRFS_STRIPE_LEN >> min_folio_shift,
-				      fs_info->block_min_order, stripe->folios);
+				      fs_info->block_min_order, stripe->folios,
+				      GFP_NOFS);
 	if (ret < 0)
 		goto error;
 
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 89d72d8cb85f..3ae480c7474b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -17,6 +17,7 @@
 #include <linux/compat.h>
 #include <linux/crc32c.h>
 #include <linux/fsverity.h>
+#include <linux/cleanup.h>
 #include "send.h"
 #include "ctree.h"
 #include "backref.h"
@@ -72,6 +73,8 @@ struct fs_path {
 #define FS_PATH_INLINE_SIZE \
 	sizeof_field(struct fs_path, inline_buf)
 
+static void fs_path_free(struct fs_path *p);
+DEFINE_FREE(fs_path_free, struct fs_path *, fs_path_free(_T))
 
 /* reused for each extent */
 struct clone_root {
@@ -981,7 +984,7 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
 	struct btrfs_inode_ref *iref;
 	struct btrfs_inode_extref *extref;
 	BTRFS_PATH_AUTO_FREE(tmp_path);
-	struct fs_path *p;
+	struct fs_path *p __free(fs_path_free) = NULL;
 	u32 cur = 0;
 	u32 total;
 	int slot = path->slots[0];
@@ -998,11 +1001,8 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
 		return -ENOMEM;
 
 	tmp_path = alloc_path_for_send();
-	if (!tmp_path) {
-		fs_path_free(p);
+	if (!tmp_path)
 		return -ENOMEM;
-	}
-
 
 	if (found_key->type == BTRFS_INODE_REF_KEY) {
 		ptr = (unsigned long)btrfs_item_ptr(eb, slot,
@@ -1034,30 +1034,27 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
 			start = btrfs_ref_to_path(root, tmp_path, name_len,
 						  name_off, eb, dir,
 						  p->buf, p->buf_len);
-			if (IS_ERR(start)) {
-				ret = PTR_ERR(start);
-				goto out;
-			}
+			if (IS_ERR(start))
+				return PTR_ERR(start);
+
 			if (start < p->buf) {
 				/* overflow , try again with larger buffer */
 				ret = fs_path_ensure_buf(p,
 						p->buf_len + p->buf - start);
 				if (ret < 0)
-					goto out;
+					return ret;
 				start = btrfs_ref_to_path(root, tmp_path,
 							  name_len, name_off,
 							  eb, dir,
 							  p->buf, p->buf_len);
-				if (IS_ERR(start)) {
-					ret = PTR_ERR(start);
-					goto out;
-				}
+				if (IS_ERR(start))
+					return PTR_ERR(start);
+
 				if (unlikely(start < p->buf)) {
 					btrfs_err(root->fs_info,
 			  "send: path ref buffer underflow for key " BTRFS_KEY_FMT,
 						  BTRFS_KEY_FMT_VALUE(found_key));
-					ret = -EINVAL;
-					goto out;
+					return -EINVAL;
 				}
 			}
 			p->start = start;
@@ -1065,17 +1062,15 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
 			ret = fs_path_add_from_extent_buffer(p, eb, name_off,
 							     name_len);
 			if (ret < 0)
-				goto out;
+				return ret;
 		}
 
 		cur += elem_size + name_len;
 		ret = iterate(dir, p, ctx);
 		if (ret)
-			goto out;
+			return ret;
 	}
 
-out:
-	fs_path_free(p);
 	return ret;
 }
 
@@ -2028,7 +2023,7 @@ static int is_first_ref(struct btrfs_root *root,
 			const char *name, int name_len)
 {
 	int ret;
-	struct fs_path *tmp_name;
+	struct fs_path *tmp_name __free(fs_path_free) = NULL;
 	u64 tmp_dir;
 
 	tmp_name = fs_path_alloc();
@@ -2037,17 +2032,13 @@ static int is_first_ref(struct btrfs_root *root,
 
 	ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name);
 	if (ret < 0)
-		goto out;
+		return ret;
 
-	if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) {
-		ret = 0;
-		goto out;
-	}
+	if (dir != tmp_dir || name_len != fs_path_len(tmp_name))
+		return 0;
 
 	ret = !memcmp(tmp_name->start, name, name_len);
 
-out:
-	fs_path_free(tmp_name);
 	return ret;
 }
 
@@ -2196,13 +2187,13 @@ static int did_overwrite_ref(struct send_ctx *sctx,
  */
 static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
 {
-	int ret = 0;
-	struct fs_path *name = NULL;
+	int ret;
+	struct fs_path *name __free(fs_path_free) = NULL;
 	u64 dir;
 	u64 dir_gen;
 
 	if (!sctx->parent_root)
-		goto out;
+		return 0;
 
 	name = fs_path_alloc();
 	if (!name)
@@ -2210,14 +2201,10 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
 
 	ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name);
 	if (ret < 0)
-		goto out;
-
-	ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
-			name->start, fs_path_len(name));
+		return ret;
 
-out:
-	fs_path_free(name);
-	return ret;
+	return did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
+				 name->start, fs_path_len(name));
 }
 
 static inline struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
@@ -2375,10 +2362,10 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
 			struct fs_path *dest)
 {
 	int ret = 0;
-	struct fs_path *name = NULL;
+	struct fs_path *name __free(fs_path_free) = NULL;
 	u64 parent_inode = 0;
 	u64 parent_gen = 0;
-	int stop = 0;
+	bool stop = false;
 	const bool is_cur_inode = (ino == sctx->cur_ino && gen == sctx->cur_inode_gen);
 
 	if (is_cur_inode && fs_path_len(&sctx->cur_inode_path) > 0) {
@@ -2389,10 +2376,8 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
 	}
 
 	name = fs_path_alloc();
-	if (!name) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	if (!name)
+		return -ENOMEM;
 
 	dest->reversed = 1;
 	fs_path_reset(dest);
@@ -2413,7 +2398,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
 		wdm = get_waiting_dir_move(sctx, ino);
 		if (wdm && wdm->orphanized) {
 			ret = gen_unique_name(sctx, ino, gen, name);
-			stop = 1;
+			stop = true;
 		} else if (wdm) {
 			ret = get_first_ref(sctx->parent_root, ino,
 					    &parent_inode, &parent_gen, name);
@@ -2422,7 +2407,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
 							&parent_inode,
 							&parent_gen, name);
 			if (ret)
-				stop = 1;
+				stop = true;
 		}
 
 		if (ret < 0)
@@ -2437,7 +2422,6 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
 	}
 
 out:
-	fs_path_free(name);
 	if (!ret) {
 		fs_path_unreverse(dest);
 		if (is_cur_inode && dest != &sctx->cur_inode_path)
@@ -2787,7 +2771,7 @@ static int trim_dir_utimes_cache(struct send_ctx *sctx)
 static int send_create_inode(struct send_ctx *sctx, u64 ino)
 {
 	int ret = 0;
-	struct fs_path *p;
+	struct fs_path *p __free(fs_path_free) = NULL;
 	int cmd;
 	struct btrfs_inode_info info;
 	u64 gen;
@@ -2801,7 +2785,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
 	if (ino != sctx->cur_ino) {
 		ret = get_inode_info(sctx->send_root, ino, &info);
 		if (ret < 0)
-			goto out;
+			return ret;
 		gen = info.gen;
 		mode = info.mode;
 		rdev = info.rdev;
@@ -2826,17 +2810,16 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
 	} else {
 		btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o",
 				(int)(mode & S_IFMT));
-		ret = -EOPNOTSUPP;
-		goto out;
+		return -EOPNOTSUPP;
 	}
 
 	ret = begin_cmd(sctx, cmd);
 	if (ret < 0)
-		goto out;
+		return ret;
 
 	ret = gen_unique_name(sctx, ino, gen, p);
 	if (ret < 0)
-		goto out;
+		return ret;
 
 	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
 	TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);
@@ -2845,7 +2828,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
 		fs_path_reset(p);
 		ret = read_symlink(sctx->send_root, ino, p);
 		if (ret < 0)
-			goto out;
+			return ret;
 		TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
 	} else if (S_ISCHR(mode) || S_ISBLK(mode) ||
 		   S_ISFIFO(mode) || S_ISSOCK(mode)) {
@@ -2855,12 +2838,9 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
 
 	ret = send_cmd(sctx);
 	if (ret < 0)
-		goto out;
-
+		return ret;
 
 tlv_put_failure:
-out:
-	fs_path_free(p);
 	return ret;
 }
 
@@ -3039,7 +3019,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
 			  struct fs_path *path)
 {
 	int ret;
-	struct fs_path *orphan;
+	struct fs_path *orphan __free(fs_path_free) = NULL;
 
 	orphan = fs_path_alloc();
 	if (!orphan)
@@ -3047,17 +3027,15 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
 
 	ret = gen_unique_name(sctx, ino, gen, orphan);
 	if (ret < 0)
-		goto out;
+		return ret;
 
 	ret = send_rename(sctx, path, orphan);
 	if (ret < 0)
-		goto out;
+		return ret;
 
 	if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen)
 		ret = fs_path_copy(&sctx->cur_inode_path, orphan);
 
-out:
-	fs_path_free(orphan);
 	return ret;
 }
 
@@ -3349,7 +3327,7 @@ static int add_pending_dir_move(struct send_ctx *sctx,
 	struct rb_node *parent = NULL;
 	struct pending_dir_move *entry = NULL, *pm;
 	struct recorded_ref *cur;
-	int exists = 0;
+	bool exists = false;
 	int ret;
 
 	pm = kmalloc_obj(*pm);
@@ -3370,7 +3348,7 @@ static int add_pending_dir_move(struct send_ctx *sctx,
 		} else if (parent_ino > entry->parent_ino) {
 			p = &(*p)->rb_right;
 		} else {
-			exists = 1;
+			exists = true;
 			break;
 		}
 	}
@@ -3467,9 +3445,9 @@ static int path_loop(struct send_ctx *sctx, struct fs_path *name,
 
 static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
 {
-	struct fs_path *from_path = NULL;
-	struct fs_path *to_path = NULL;
-	struct fs_path *name = NULL;
+	struct fs_path *from_path __free(fs_path_free) = NULL;
+	struct fs_path *to_path __free(fs_path_free) = NULL;
+	struct fs_path *name __free(fs_path_free) = NULL;
 	u64 orig_progress = sctx->send_progress;
 	struct recorded_ref *cur;
 	u64 parent_ino, parent_gen;
@@ -3482,10 +3460,8 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
 
 	name = fs_path_alloc();
 	from_path = fs_path_alloc();
-	if (!name || !from_path) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	if (!name || !from_path)
+		return -ENOMEM;
 
 	dm = get_waiting_dir_move(sctx, pm->ino);
 	ASSERT(dm);
@@ -3599,9 +3575,6 @@ finish:
 	}
 
 out:
-	fs_path_free(name);
-	fs_path_free(from_path);
-	fs_path_free(to_path);
 	sctx->send_progress = orig_progress;
 
 	return ret;
@@ -4157,7 +4130,7 @@ static int rename_current_inode(struct send_ctx *sctx,
 /*
  * This does all the move/link/unlink/rmdir magic.
  */
-static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
+static int process_recorded_refs(struct send_ctx *sctx, bool *pending_move)
 {
 	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
 	int ret = 0;
@@ -4417,7 +4390,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
 				goto out;
 			if (ret == 1) {
 				can_rename = false;
-				*pending_move = 1;
+				*pending_move = true;
 			}
 		}
 
@@ -4428,7 +4401,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
 				goto out;
 			if (ret == 1) {
 				can_rename = false;
-				*pending_move = 1;
+				*pending_move = true;
 			}
 		}
 
@@ -4793,7 +4766,7 @@ static int process_all_refs(struct send_ctx *sctx,
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	iterate_inode_ref_t cb;
-	int pending_move = 0;
+	bool pending_move = false;
 
 	path = alloc_path_for_send();
 	if (!path)
@@ -6524,8 +6497,7 @@ static int process_all_extents(struct send_ctx *sctx)
 }
 
 static int process_recorded_refs_if_needed(struct send_ctx *sctx, bool at_end,
-					   int *pending_move,
-					   int *refs_processed)
+					   bool *pending_move, bool *refs_processed)
 {
 	int ret;
 
@@ -6543,7 +6515,7 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, bool at_end,
 	if (ret < 0)
 		return ret;
 
-	*refs_processed = 1;
+	*refs_processed = true;
 	return 0;
 }
 
@@ -6559,12 +6531,12 @@ static int finish_inode_if_needed(struct send_ctx *sctx, bool at_end)
 	u64 right_uid;
 	u64 right_gid;
 	u64 right_fileattr;
-	int need_chmod = 0;
-	int need_chown = 0;
+	bool need_chmod = false;
+	bool need_chown = false;
 	bool need_fileattr = false;
-	int need_truncate = 1;
-	int pending_move = 0;
-	int refs_processed = 0;
+	bool need_truncate = true;
+	bool pending_move = false;
+	bool refs_processed = false;
 
 	if (sctx->ignore_cur_inode)
 		return 0;
@@ -6602,11 +6574,11 @@ static int finish_inode_if_needed(struct send_ctx *sctx, bool at_end)
 	left_fileattr = info.fileattr;
 
 	if (!sctx->parent_root || sctx->cur_inode_new) {
-		need_chown = 1;
+		need_chown = true;
 		if (!S_ISLNK(sctx->cur_inode_mode))
-			need_chmod = 1;
+			need_chmod = true;
 		if (sctx->cur_inode_next_write_offset == sctx->cur_inode_size)
-			need_truncate = 0;
+			need_truncate = false;
 	} else {
 		u64 old_size;
 
@@ -6620,15 +6592,15 @@ static int finish_inode_if_needed(struct send_ctx *sctx, bool at_end)
 		right_fileattr = info.fileattr;
 
 		if (left_uid != right_uid || left_gid != right_gid)
-			need_chown = 1;
+			need_chown = true;
 		if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode)
-			need_chmod = 1;
+			need_chmod = true;
 		if (!S_ISLNK(sctx->cur_inode_mode) && left_fileattr != right_fileattr)
 			need_fileattr = true;
 		if ((old_size == sctx->cur_inode_size) ||
 		    (sctx->cur_inode_size > old_size &&
 		     sctx->cur_inode_next_write_offset == sctx->cur_inode_size))
-			need_truncate = 0;
+			need_truncate = false;
 	}
 
 	if (S_ISREG(sctx->cur_inode_mode)) {
@@ -7986,7 +7958,7 @@ long btrfs_ioctl_send(struct btrfs_root *send_root, const struct btrfs_ioctl_sen
 	u64 *clone_sources_tmp = NULL;
 	int clone_sources_to_rollback = 0;
 	size_t alloc_size;
-	int sort_clone_roots = 0;
+	bool sort_clone_roots = false;
 	struct btrfs_lru_cache_entry *entry;
 	struct btrfs_lru_cache_entry *tmp;
 
@@ -8209,7 +8181,7 @@ long btrfs_ioctl_send(struct btrfs_root *send_root, const struct btrfs_ioctl_sen
 	sort(sctx->clone_roots, sctx->clone_roots_cnt,
 			sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
 			NULL);
-	sort_clone_roots = 1;
+	sort_clone_roots = true;
 
 	ret = flush_delalloc_roots(sctx);
 	if (ret)
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index f0436eea1544..e6641597b321 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -1411,6 +1411,13 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
  *   This is where we reclaim all of the pinned space generated by running the
  *   iputs
  *
+ * RECLAIM_ZONES
+ *   This state works only for the zoned mode. We scan the block groups in the
+ *   reclaim_bgs_list and check if we can relocate them. If so, we perform the
+ *   relocation to garbage collect the zone. On each of these runs,
+ *   BTRFS_ZONED_SYNC_RECLAIM_BATCH (5) block groups will be reclaimed, after all
+ *   unused block groups have been deleted.
+ *
  * RESET_ZONES
  *   This state works only for the zoned mode. We scan the unused block group
  *   list and reset the zones and reuse the block group.
@@ -1698,6 +1705,7 @@ static int handle_reserve_ticket(struct btrfs_space_info *space_info,
 						ARRAY_SIZE(evict_flush_states));
 		break;
 	case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE:
+	case BTRFS_RESERVE_FLUSH_ZONED_RELOCATION:
 		priority_reclaim_data_space(space_info, ticket);
 		break;
 	default:
@@ -1961,6 +1969,7 @@ int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes,
 
 	ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
 	       flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE ||
+	       flush == BTRFS_RESERVE_FLUSH_ZONED_RELOCATION ||
 	       flush == BTRFS_RESERVE_NO_FLUSH, "flush=%d", flush);
 	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA,
 	       "current->journal_info=0x%lx flush=%d",
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 24f45072ca4b..aa836e8a9d4a 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -78,6 +78,17 @@ enum btrfs_reserve_flush_enum {
 	BTRFS_RESERVE_FLUSH_ALL_STEAL,
 
 	/*
+	 * This is for relocation on zoned filesystems only. We need to use
+	 * priority flushing for this, because otherwise we can deadlock
+	 * waiting for a ticket that cannot be granted, because we cannot do
+	 * any allocations.
+	 *
+	 * Apart from being specific to zoned relocation, it is equal to
+	 * BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE.
+	 */
+	BTRFS_RESERVE_FLUSH_ZONED_RELOCATION,
+
+	/*
 	 * This is for btrfs_use_block_rsv only.  We have exhausted our block
 	 * rsv and our global block rsv.  This can happen for things like
 	 * delalloc where we are overwriting a lot of extents with a single
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index f82e71f5d88b..56060acac2e9 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -10,41 +10,13 @@
  *
  * Limitations:
  *
- * - Only support 64K page size for now
- *   This is to make metadata handling easier, as 64K page would ensure
- *   all nodesize would fit inside one page, thus we don't need to handle
- *   cases where a tree block crosses several pages.
+ * - Metadata must be fully aligned to node size
+ *   So when nodesize <= page size, the metadata can never cross folio boundaries.
  *
- * - Only metadata read-write for now
- *   The data read-write part is in development.
- *
- * - Metadata can't cross 64K page boundary
- *   btrfs-progs and kernel have done that for a while, thus only ancient
- *   filesystems could have such problem.  For such case, do a graceful
- *   rejection.
- *
- * Special behavior:
- *
- * - Metadata
- *   Metadata read is fully supported.
- *   Meaning when reading one tree block will only trigger the read for the
- *   needed range, other unrelated range in the same page will not be touched.
- *
- *   Metadata write support is partial.
- *   The writeback is still for the full page, but we will only submit
- *   the dirty extent buffers in the page.
- *
- *   This means, if we have a metadata page like this:
- *
- *   Page offset
- *   0         16K         32K         48K        64K
- *   |/////////|           |///////////|
- *        \- Tree block A        \- Tree block B
- *
- *   Even if we just want to writeback tree block A, we will also writeback
- *   tree block B if it's also dirty.
- *
- *   This may cause extra metadata writeback which results more COW.
+ * - Only support blocks per folio <= min(BTRFS_MAX_FOLIO_SIZE / fs block size,
+ *					  BTRFS_MAX_BLOCKS_PER_FOLIO)
+ *   This is to ensure we can afford an on-stack bitmap, without the need to allocate
+ *   bitmap memory at runtime.
  *
  * Implementation:
  *
@@ -224,11 +196,8 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
 					    struct folio *folio, u64 start, u32 len)
 {
 	struct btrfs_folio_state *bfs = folio_get_private(folio);
-	const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
 	const int nbits = (len >> fs_info->sectorsize_bits);
 	unsigned long flags;
-	unsigned int cleared = 0;
-	int bit = start_bit;
 	bool last;
 
 	btrfs_subpage_assert(fs_info, folio, start, len);
@@ -245,15 +214,10 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
 		spin_unlock_irqrestore(&bfs->lock, flags);
 		return true;
 	}
-
-	for_each_set_bit_from(bit, bfs->bitmaps, start_bit + nbits) {
-		clear_bit(bit, bfs->bitmaps);
-		cleared++;
-	}
-	ASSERT(atomic_read(&bfs->nr_locked) >= cleared,
-	       "atomic_read(&bfs->nr_locked)=%d cleared=%d",
-	       atomic_read(&bfs->nr_locked), cleared);
-	last = atomic_sub_and_test(cleared, &bfs->nr_locked);
+	ASSERT(atomic_read(&bfs->nr_locked) >= nbits,
+	       "atomic_read(&bfs->nr_locked)=%d nbits=%d",
+	       atomic_read(&bfs->nr_locked), nbits);
+	last = atomic_sub_and_test(nbits, &bfs->nr_locked);
 	spin_unlock_irqrestore(&bfs->lock, flags);
 	return last;
 }
@@ -305,15 +269,13 @@ void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
 }
 
 void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
-				 struct folio *folio, unsigned long bitmap)
+				 struct folio *folio, unsigned long *bitmap)
 {
 	struct btrfs_folio_state *bfs = folio_get_private(folio);
 	const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
-	const int start_bit = blocks_per_folio * btrfs_bitmap_nr_locked;
+	const unsigned int nbits = bitmap_weight(bitmap, blocks_per_folio);
 	unsigned long flags;
 	bool last = false;
-	int cleared = 0;
-	int bit;
 
 	if (!btrfs_is_subpage(fs_info, folio)) {
 		folio_unlock(folio);
@@ -327,14 +289,10 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
 	}
 
 	spin_lock_irqsave(&bfs->lock, flags);
-	for_each_set_bit(bit, &bitmap, blocks_per_folio) {
-		if (test_and_clear_bit(bit + start_bit, bfs->bitmaps))
-			cleared++;
-	}
-	ASSERT(atomic_read(&bfs->nr_locked) >= cleared,
-	       "atomic_read(&bfs->nr_locked)=%d cleared=%d",
-	       atomic_read(&bfs->nr_locked), cleared);
-	last = atomic_sub_and_test(cleared, &bfs->nr_locked);
+	ASSERT(atomic_read(&bfs->nr_locked) >= nbits,
+	       "atomic_read(&bfs->nr_locked)=%d nbits=%d",
+	       atomic_read(&bfs->nr_locked), nbits);
+	last = atomic_sub_and_test(nbits, &bfs->nr_locked);
 	spin_unlock_irqrestore(&bfs->lock, flags);
 	if (last)
 		folio_unlock(folio);
@@ -479,64 +437,6 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
 	spin_unlock_irqrestore(&bfs->lock, flags);
 }
 
-void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
-			       struct folio *folio, u64 start, u32 len)
-{
-	struct btrfs_folio_state *bfs = folio_get_private(folio);
-	unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
-							ordered, start, len);
-	unsigned long flags;
-
-	spin_lock_irqsave(&bfs->lock, flags);
-	bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
-	folio_set_ordered(folio);
-	spin_unlock_irqrestore(&bfs->lock, flags);
-}
-
-void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
-				 struct folio *folio, u64 start, u32 len)
-{
-	struct btrfs_folio_state *bfs = folio_get_private(folio);
-	unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
-							ordered, start, len);
-	unsigned long flags;
-
-	spin_lock_irqsave(&bfs->lock, flags);
-	bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
-	if (subpage_test_bitmap_all_zero(fs_info, folio, ordered))
-		folio_clear_ordered(folio);
-	spin_unlock_irqrestore(&bfs->lock, flags);
-}
-
-void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info,
-			       struct folio *folio, u64 start, u32 len)
-{
-	struct btrfs_folio_state *bfs = folio_get_private(folio);
-	unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
-							checked, start, len);
-	unsigned long flags;
-
-	spin_lock_irqsave(&bfs->lock, flags);
-	bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
-	if (subpage_test_bitmap_all_set(fs_info, folio, checked))
-		folio_set_checked(folio);
-	spin_unlock_irqrestore(&bfs->lock, flags);
-}
-
-void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
-				 struct folio *folio, u64 start, u32 len)
-{
-	struct btrfs_folio_state *bfs = folio_get_private(folio);
-	unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
-							checked, start, len);
-	unsigned long flags;
-
-	spin_lock_irqsave(&bfs->lock, flags);
-	bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
-	folio_clear_checked(folio);
-	spin_unlock_irqrestore(&bfs->lock, flags);
-}
-
 /*
  * Unlike set/clear which is dependent on each page status, for test all bits
  * are tested in the same way.
@@ -560,8 +460,6 @@ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info,	\
 IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
 IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
 IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
-IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered);
-IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked);
 
 /*
  * Note that, in selftests (extent-io-tests), we can have empty fs_info passed
@@ -657,30 +555,55 @@ IMPLEMENT_BTRFS_PAGE_OPS(dirty, folio_mark_dirty, folio_clear_dirty_for_io,
 			 folio_test_dirty);
 IMPLEMENT_BTRFS_PAGE_OPS(writeback, folio_start_writeback, folio_end_writeback,
 			 folio_test_writeback);
-IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered,
-			 folio_test_ordered);
-IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,
-			 folio_test_checked);
 
-#define GET_SUBPAGE_BITMAP(fs_info, folio, name, dst)			\
-{									\
-	const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \
-	const struct btrfs_folio_state *__bfs = folio_get_private(folio); \
-									\
-	ASSERT(__bpf <= BITS_PER_LONG);					\
-	*dst = bitmap_read(__bfs->bitmaps,				\
-			   __bpf * btrfs_bitmap_nr_##name, __bpf);	\
+#define DEFINE_GET_SUBPAGE_BITMAP(name)						\
+static inline unsigned long get_bitmap_value_##name(				\
+					const struct btrfs_fs_info *fs_info,	\
+					struct folio *folio)			\
+{										\
+	const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio);	\
+	const struct btrfs_folio_state *__bfs = folio_get_private(folio);	\
+	unsigned long value;							\
+										\
+	ASSERT(__bpf <= BITS_PER_LONG);						\
+	value = bitmap_read(__bfs->bitmaps, __bpf * btrfs_bitmap_nr_##name,	\
+			     __bpf);						\
+	return value;								\
+}										\
+static inline const unsigned long *get_bitmap_pointer_##name(			\
+					const struct btrfs_fs_info *fs_info,	\
+					struct folio *folio)			\
+{										\
+	const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio);	\
+	struct btrfs_folio_state *__bfs = folio_get_private(folio);		\
+	unsigned long *pointer;							\
+										\
+	ASSERT(__bpf >= BITS_PER_LONG);						\
+	ASSERT(IS_ALIGNED(__bpf, BITS_PER_LONG));				\
+	pointer = __bfs->bitmaps + (BIT_WORD(__bpf) * btrfs_bitmap_nr_##name);	\
+	return pointer;								\
 }
 
-#define SUBPAGE_DUMP_BITMAP(fs_info, folio, name, start, len)		\
-{									\
-	unsigned long bitmap;						\
-	const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \
-									\
-	GET_SUBPAGE_BITMAP(fs_info, folio, name, &bitmap);		\
-	btrfs_warn(fs_info,						\
-	"dumping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \
-		   start, len, folio_pos(folio), __bpf, &bitmap);	\
+DEFINE_GET_SUBPAGE_BITMAP(uptodate);
+DEFINE_GET_SUBPAGE_BITMAP(dirty);
+DEFINE_GET_SUBPAGE_BITMAP(writeback);
+
+#define SUBPAGE_DUMP_BITMAP(fs_info, folio, name, start, len)			\
+{										\
+	const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio);	\
+										\
+	if (__bpf <= BITS_PER_LONG) {						\
+		unsigned long bitmap = get_bitmap_value_##name(fs_info, folio);	\
+										\
+		btrfs_warn(fs_info,						\
+	"dumping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl",	\
+		   start, len, folio_pos(folio), __bpf, &bitmap);		\
+	} else {								\
+		btrfs_warn(fs_info,						\
+	"dumping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl",	\
+		   start, len, folio_pos(folio), __bpf,				\
+		   get_bitmap_pointer_##name(fs_info, folio));			\
+	}									\
 }
 
 /*
@@ -728,7 +651,6 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
 {
 	struct btrfs_folio_state *bfs;
 	unsigned long flags;
-	unsigned int start_bit;
 	unsigned int nbits;
 	int ret;
 
@@ -737,15 +659,8 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
 		return;
 
 	bfs = folio_get_private(folio);
-	start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
 	nbits = len >> fs_info->sectorsize_bits;
 	spin_lock_irqsave(&bfs->lock, flags);
-	/* Target range should not yet be locked. */
-	if (unlikely(!bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits))) {
-		SUBPAGE_DUMP_BITMAP(fs_info, folio, locked, start, len);
-		ASSERT(bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits));
-	}
-	bitmap_set(bfs->bitmaps, start_bit, nbits);
 	ret = atomic_add_return(nbits, &bfs->nr_locked);
 	ASSERT(ret <= btrfs_blocks_per_folio(fs_info, folio));
 	spin_unlock_irqrestore(&bfs->lock, flags);
@@ -778,51 +693,74 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
 {
 	struct btrfs_folio_state *bfs;
 	const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
-	unsigned long uptodate_bitmap;
-	unsigned long dirty_bitmap;
-	unsigned long writeback_bitmap;
-	unsigned long ordered_bitmap;
-	unsigned long checked_bitmap;
-	unsigned long locked_bitmap;
 	unsigned long flags;
 
 	ASSERT(folio_test_private(folio) && folio_get_private(folio));
 	ASSERT(blocks_per_folio > 1);
 	bfs = folio_get_private(folio);
 
-	spin_lock_irqsave(&bfs->lock, flags);
-	GET_SUBPAGE_BITMAP(fs_info, folio, uptodate, &uptodate_bitmap);
-	GET_SUBPAGE_BITMAP(fs_info, folio, dirty, &dirty_bitmap);
-	GET_SUBPAGE_BITMAP(fs_info, folio, writeback, &writeback_bitmap);
-	GET_SUBPAGE_BITMAP(fs_info, folio, ordered, &ordered_bitmap);
-	GET_SUBPAGE_BITMAP(fs_info, folio, checked, &checked_bitmap);
-	GET_SUBPAGE_BITMAP(fs_info, folio, locked, &locked_bitmap);
-	spin_unlock_irqrestore(&bfs->lock, flags);
-
 	dump_page(folio_page(folio, 0), "btrfs folio state dump");
+
+	if (blocks_per_folio <= BITS_PER_LONG) {
+		unsigned long uptodate;
+		unsigned long dirty;
+		unsigned long writeback;
+
+		spin_lock_irqsave(&bfs->lock, flags);
+		uptodate = get_bitmap_value_uptodate(fs_info, folio);
+		dirty = get_bitmap_value_dirty(fs_info, folio);
+		writeback = get_bitmap_value_writeback(fs_info, folio);
+
+		spin_unlock_irqrestore(&bfs->lock, flags);
+
+		btrfs_warn(fs_info,
+"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl writeback=%*pbl",
+			    start, len, folio_pos(folio),
+			    blocks_per_folio, &uptodate,
+			    blocks_per_folio, &dirty,
+			    blocks_per_folio, &writeback);
+		return;
+	}
+
+	spin_lock_irqsave(&bfs->lock, flags);
 	btrfs_warn(fs_info,
-"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
+"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl writeback=%*pbl",
 		    start, len, folio_pos(folio),
-		    blocks_per_folio, &uptodate_bitmap,
-		    blocks_per_folio, &dirty_bitmap,
-		    blocks_per_folio, &locked_bitmap,
-		    blocks_per_folio, &writeback_bitmap,
-		    blocks_per_folio, &ordered_bitmap,
-		    blocks_per_folio, &checked_bitmap);
+		    blocks_per_folio, get_bitmap_pointer_uptodate(fs_info, folio),
+		    blocks_per_folio, get_bitmap_pointer_dirty(fs_info, folio),
+		    blocks_per_folio, get_bitmap_pointer_writeback(fs_info, folio));
+	spin_unlock_irqrestore(&bfs->lock, flags);
 }
 
-void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
-				    struct folio *folio,
-				    unsigned long *ret_bitmap)
+void btrfs_copy_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
+				     struct folio *folio,
+				     unsigned long *dst)
 {
 	struct btrfs_folio_state *bfs;
+	const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
 	unsigned long flags;
+	unsigned long value;
+
+	if (blocks_per_folio == 1) {
+		value = 1;
+		bitmap_copy(dst, &value, 1);
+		return;
+	}
 
 	ASSERT(folio_test_private(folio) && folio_get_private(folio));
-	ASSERT(btrfs_blocks_per_folio(fs_info, folio) > 1);
+	ASSERT(blocks_per_folio > 1);
 	bfs = folio_get_private(folio);
 
+	if (blocks_per_folio <= BITS_PER_LONG) {
+		spin_lock_irqsave(&bfs->lock, flags);
+		value = bitmap_read(bfs->bitmaps, btrfs_bitmap_nr_dirty * blocks_per_folio,
+				    blocks_per_folio);
+		spin_unlock_irqrestore(&bfs->lock, flags);
+		bitmap_copy(dst, &value, blocks_per_folio);
+		return;
+	}
 	spin_lock_irqsave(&bfs->lock, flags);
-	GET_SUBPAGE_BITMAP(fs_info, folio, dirty, ret_bitmap);
+	bitmap_copy(dst, get_bitmap_pointer_dirty(fs_info, folio),
+		    blocks_per_folio);
 	spin_unlock_irqrestore(&bfs->lock, flags);
 }
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index d81a0ade559f..c6d7394e6418 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -14,15 +14,15 @@ struct folio;
 /*
  * Extra info for subpage bitmap.
  *
- * For subpage we pack all uptodate/dirty/writeback/ordered bitmaps into
+ * For subpage we pack all uptodate/dirty/writeback bitmaps into
  * one larger bitmap.
  *
  * This structure records how they are organized in the bitmap:
  *
- * /- uptodate          /- dirty        /- ordered
+ * /- uptodate          /- dirty        /- writeback
  * |			|		|
  * v			v		v
- * |u|u|u|u|........|u|u|d|d|.......|d|d|o|o|.......|o|o|
+ * |u|u|u|u|........|u|u|d|d|.......|d|d|w|w|.......|w|w|
  * |< sectors_per_page >|
  *
  * Unlike regular macro-like enums, here we do not go upper-case names, as
@@ -40,23 +40,6 @@ enum {
 	 */
 	btrfs_bitmap_nr_writeback,
 
-	/*
-	 * The ordered and checked flags are for COW fixup, already marked
-	 * deprecated, and will be removed eventually.
-	 */
-	btrfs_bitmap_nr_ordered,
-	btrfs_bitmap_nr_checked,
-
-	/*
-	 * The locked bit is for async delalloc range (compression), currently
-	 * async extent is queued with the range locked, until the compression
-	 * is done.
-	 * So an async extent can unlock the range at any random timing.
-	 *
-	 * This will need a rework on the async extent lifespan (mark writeback
-	 * and do compression) before deprecating this flag.
-	 */
-	btrfs_bitmap_nr_locked,
 	btrfs_bitmap_nr_max
 };
 
@@ -133,7 +116,7 @@ void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
 void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
 			  struct folio *folio, u64 start, u32 len);
 void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
-				 struct folio *folio, unsigned long bitmap);
+				 struct folio *folio, unsigned long *bitmap);
 /*
  * Template for subpage related operations.
  *
@@ -181,8 +164,6 @@ bool btrfs_meta_folio_test_##name(struct folio *folio, const struct extent_buffe
 DECLARE_BTRFS_SUBPAGE_OPS(uptodate);
 DECLARE_BTRFS_SUBPAGE_OPS(dirty);
 DECLARE_BTRFS_SUBPAGE_OPS(writeback);
-DECLARE_BTRFS_SUBPAGE_OPS(ordered);
-DECLARE_BTRFS_SUBPAGE_OPS(checked);
 
 /*
  * Helper for error cleanup, where a folio will have its dirty flag cleared,
@@ -203,9 +184,9 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
 void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
 				  struct folio *folio, u64 start, u32 len);
 bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct extent_buffer *eb);
-void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
-				    struct folio *folio,
-				    unsigned long *ret_bitmap);
+void btrfs_copy_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
+				     struct folio *folio,
+				     unsigned long *dst);
 void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
 				      struct folio *folio, u64 start, u32 len);
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 636154861d7c..a7d804219bec 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -60,6 +60,7 @@
 #include "verity.h"
 #include "super.h"
 #include "extent-tree.h"
+#include "tree-log.h"
 #define CREATE_TRACE_POINTS
 #include <trace/events/btrfs.h>
 
@@ -1633,8 +1634,7 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
 		}
 	}
 
-	devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
-			       GFP_KERNEL);
+	devices_info = kmalloc_objs(*devices_info, nr_devices);
 	if (!devices_info)
 		return -ENOMEM;
 
@@ -1732,15 +1732,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	u64 total_free_data = 0;
 	u64 total_free_meta = 0;
 	u32 bits = fs_info->sectorsize_bits;
-	__be32 *fsid = (__be32 *)fs_info->fs_devices->fsid;
+	__be32 *fsid;
 	unsigned factor = 1;
 	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
 	int ret;
 	u64 thresh = 0;
-	int mixed = 0;
+	bool mixed = false;
+	__kernel_fsid_t f_fsid;
 
 	list_for_each_entry(found, &fs_info->space_info, list) {
-		if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
+		if (found->flags & BTRFS_BLOCK_GROUP_DATA &&
+		    found->subgroup_id != BTRFS_SUB_GROUP_DATA_RELOC) {
 			int i;
 
 			total_free_data += found->disk_total - found->disk_used;
@@ -1759,7 +1761,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 		 */
 		if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) {
 			if (found->flags & BTRFS_BLOCK_GROUP_DATA)
-				mixed = 1;
+				mixed = true;
 			else
 				total_free_meta += found->disk_total -
 					found->disk_used;
@@ -1818,14 +1820,38 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bsize = fs_info->sectorsize;
 	buf->f_namelen = BTRFS_NAME_LEN;
 
-	/* We treat it as constant endianness (it doesn't matter _which_)
-	   because we want the fsid to come out the same whether mounted
-	   on a big-endian or little-endian host */
-	buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
-	buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
+	/*
+	 * fs_devices->fsid is dynamically generated when temp_fsid is active
+	 * to support cloned filesystems. Use the original on-disk fsid instead,
+	 * as it remains consistent across mount cycles.
+	 */
+	if (fs_info->fs_devices->temp_fsid)
+		fsid = (__be32 *)fs_info->super_copy->fsid;
+	else
+		fsid = (__be32 *)fs_info->fs_devices->fsid;
+
+	/*
+	 * We treat it as constant endianness (it doesn't matter _which_)
+	 * because we want the fsid to come out the same whether mounted
+	 * on a big-endian or little-endian host.
+	 */
+	f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
+	f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
+
 	/* Mask in the root object ID too, to disambiguate subvols */
-	buf->f_fsid.val[0] ^= btrfs_root_id(BTRFS_I(d_inode(dentry))->root) >> 32;
-	buf->f_fsid.val[1] ^= btrfs_root_id(BTRFS_I(d_inode(dentry))->root);
+	f_fsid.val[0] ^= btrfs_root_id(BTRFS_I(d_inode(dentry))->root) >> 32;
+	f_fsid.val[1] ^= btrfs_root_id(BTRFS_I(d_inode(dentry))->root);
+
+	/* Hash dev_t to avoid f_fsid collision with cloned filesystems. */
+	if (fs_info->fs_devices->total_devices == 1) {
+		__kernel_fsid_t dev_fsid =
+			u64_to_fsid(huge_encode_dev(fs_info->fs_devices->latest_dev->bdev->bd_dev));
+
+		f_fsid.val[0] ^= dev_fsid.val[1];
+		f_fsid.val[1] ^= dev_fsid.val[0];
+	}
+
+	memcpy(&buf->f_fsid, &f_fsid, sizeof(f_fsid));
 
 	return 0;
 }
@@ -1873,6 +1899,7 @@ static int btrfs_get_tree_super(struct fs_context *fc)
 	fs_info->fs_devices = fs_devices;
 	mutex_unlock(&uuid_mutex);
 
+	fc->sb_flags |= SB_NOSEC;
 
 	sb = sget_fc(fc, btrfs_fc_test_super, set_anon_super_fc);
 	if (IS_ERR(sb)) {
@@ -2405,7 +2432,7 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
 static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_control *sc)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
-	const s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
+	const s64 nr = percpu_counter_read_positive(&fs_info->evictable_extent_maps);
 
 	trace_btrfs_extent_map_shrinker_count(fs_info, nr);
 
@@ -2605,6 +2632,9 @@ static const struct init_sequence mod_init_seq[] = {
 		.init_func = btrfs_init_compress,
 		.exit_func = btrfs_exit_compress,
 	}, {
+		.init_func = btrfs_init_block_group,
+		.exit_func = btrfs_exit_block_group,
+	}, {
 		.init_func = btrfs_init_cachep,
 		.exit_func = btrfs_destroy_cachep,
 	}, {
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 19c127ac6d10..6287d940323d 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -145,6 +145,7 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
 	fs_info->csum_size = 4;
 	fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) /
 		fs_info->csum_size;
+	fs_info->use_bitmap = btrfs_use_bitmap;
 	set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
 
 	test_mnt->mnt_sb->s_fs_info = fs_info;
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index ebf68fcd2149..0425b3b68716 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -398,10 +398,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache,
 	int ret;
 	u64 offset;
 	u64 max_extent_size;
-	const struct btrfs_free_space_op test_free_space_ops = {
-		.use_bitmap = test_use_bitmap,
-	};
-	const struct btrfs_free_space_op *orig_free_space_ops;
+	bool (*orig_use_bitmap)(struct btrfs_free_space_ctl *ctl,
+				struct btrfs_free_space *info);
 
 	test_msg("running space stealing from bitmap to extent tests");
 
@@ -423,8 +421,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache,
 	 * that forces use of bitmaps as soon as we have at least 1
 	 * extent entry.
 	 */
-	orig_free_space_ops = cache->free_space_ctl->op;
-	cache->free_space_ctl->op = &test_free_space_ops;
+	orig_use_bitmap = cache->fs_info->use_bitmap;
+	cache->fs_info->use_bitmap = test_use_bitmap;
 
 	/*
 	 * Extent entry covering free space range [128Mb - 256Kb, 128Mb - 128Kb[
@@ -818,7 +816,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache,
 	if (ret)
 		return ret;
 
-	cache->free_space_ctl->op = orig_free_space_ops;
+	cache->fs_info->use_bitmap = orig_use_bitmap;
 	btrfs_remove_free_space_cache(cache);
 
 	return 0;
@@ -832,10 +830,8 @@ static bool bytes_index_use_bitmap(struct btrfs_free_space_ctl *ctl,
 
 static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize)
 {
-	const struct btrfs_free_space_op test_free_space_ops = {
-		.use_bitmap = bytes_index_use_bitmap,
-	};
-	const struct btrfs_free_space_op *orig_free_space_ops;
+	bool (*orig_use_bitmap)(struct btrfs_free_space_ctl *ctl,
+				struct btrfs_free_space *info);
 	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
 	struct btrfs_free_space *entry;
 	struct rb_node *node;
@@ -892,8 +888,8 @@ static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize)
 
 	/* Now validate bitmaps with different ->max_extent_size. */
 	btrfs_remove_free_space_cache(cache);
-	orig_free_space_ops = cache->free_space_ctl->op;
-	cache->free_space_ctl->op = &test_free_space_ops;
+	orig_use_bitmap = cache->fs_info->use_bitmap;
+	cache->fs_info->use_bitmap = bytes_index_use_bitmap;
 
 	ret = test_add_free_space_entry(cache, 0, sectorsize, 1);
 	if (ret) {
@@ -997,7 +993,7 @@ static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize)
 		return -EINVAL;
 	}
 
-	cache->free_space_ctl->op = orig_free_space_ops;
+	cache->fs_info->use_bitmap = orig_use_bitmap;
 	btrfs_remove_free_space_cache(cache);
 	return 0;
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 194f581b36f3..8f9419728100 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -394,6 +394,7 @@ loop:
 	cur_trans->transid = fs_info->generation;
 	fs_info->running_transaction = cur_trans;
 	cur_trans->aborted = 0;
+	trace_btrfs_transaction_start(cur_trans);
 	spin_unlock(&fs_info->trans_lock);
 
 	return 0;
@@ -630,7 +631,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
 	 * the appropriate flushing if need be.
 	 */
 	if (num_items && root != fs_info->chunk_root) {
-		qgroup_reserved = num_items * fs_info->nodesize;
+		qgroup_reserved = (num_items << fs_info->nodesize_bits);
 		/*
 		 * Use prealloc for now, as there might be a currently running
 		 * transaction that could free this reserved space prematurely
@@ -2114,7 +2115,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
 	btrfs_put_transaction(cur_trans);
 	btrfs_put_transaction(cur_trans);
 
-	trace_btrfs_transaction_commit(fs_info);
+	trace_btrfs_transaction_commit(trans);
 
 	if (current->journal_info == trans)
 		current->journal_info = NULL;
@@ -2266,7 +2267,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 	btrfs_create_pending_block_groups(trans);
 
 	if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
-		int run_it = 0;
+		bool run_it = false;
 
 		/* this mutex is also taken before trying to set
 		 * block groups readonly.  We need to make sure
@@ -2284,7 +2285,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 		mutex_lock(&fs_info->ro_block_group_mutex);
 		if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN,
 				      &cur_trans->flags))
-			run_it = 1;
+			run_it = true;
 		mutex_unlock(&fs_info->ro_block_group_mutex);
 
 		if (run_it) {
@@ -2320,6 +2321,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 	}
 
 	cur_trans->state = TRANS_STATE_COMMIT_PREP;
+	trace_btrfs_transaction_commit(trans);
 	wake_up(&fs_info->transaction_blocked_wait);
 	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
 
@@ -2358,6 +2360,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 	}
 
 	cur_trans->state = TRANS_STATE_COMMIT_START;
+	trace_btrfs_transaction_commit(trans);
 	wake_up(&fs_info->transaction_blocked_wait);
 	spin_unlock(&fs_info->trans_lock);
 
@@ -2413,6 +2416,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 	spin_lock(&fs_info->trans_lock);
 	add_pending_snapshot(trans);
 	cur_trans->state = TRANS_STATE_COMMIT_DOING;
+	trace_btrfs_transaction_commit(trans);
 	spin_unlock(&fs_info->trans_lock);
 
 	/*
@@ -2561,6 +2565,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 
 	spin_lock(&fs_info->trans_lock);
 	cur_trans->state = TRANS_STATE_UNBLOCKED;
+	trace_btrfs_transaction_commit(trans);
 	fs_info->running_transaction = NULL;
 	spin_unlock(&fs_info->trans_lock);
 	mutex_unlock(&fs_info->reloc_mutex);
@@ -2603,6 +2608,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 	 * which can change it.
 	 */
 	cur_trans->state = TRANS_STATE_SUPER_COMMITTED;
+	trace_btrfs_transaction_commit(trans);
 	wake_up(&cur_trans->commit_wait);
 	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
 
@@ -2619,6 +2625,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 	 * which can change it.
 	 */
 	cur_trans->state = TRANS_STATE_COMPLETED;
+	trace_btrfs_transaction_commit(trans);
 	wake_up(&cur_trans->commit_wait);
 	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
 
@@ -2632,8 +2639,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 	if (trans->type & __TRANS_FREEZABLE)
 		sb_end_intwrite(fs_info->sb);
 
-	trace_btrfs_transaction_commit(fs_info);
-
 	btrfs_scrub_continue(fs_info);
 
 	if (current->journal_info == trans)
@@ -2722,17 +2727,33 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
  *
  * We'll complete the cleanup in btrfs_end_transaction and
  * btrfs_commit_transaction.
+ *
+ * Note: the parameter @error encodes whether the transaction abort was first hit
+ *       (setting the FS_ERROR state bit in btrfs_abort_transaction())
+ *       - positive number - first hit
+ *       - negative number - abort after it was already done
  */
 void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
 				      const char *function,
-				      unsigned int line, int error, bool first_hit)
+				      unsigned int line, int error)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
+	bool first_hit = false;
+
+	if (error > 0) {
+		error = -error;
+		first_hit = true;
+	}
 
 	WRITE_ONCE(trans->aborted, error);
 	WRITE_ONCE(trans->transaction->aborted, error);
-	if (first_hit && error == -ENOSPC)
-		btrfs_dump_space_info_for_trans_abort(fs_info);
+	trace_btrfs_transaction_abort(trans);
+	if (first_hit) {
+		btrfs_err(fs_info, "Transaction %llu aborted (error %d)",
+			  trans->transid, error);
+		if (error == -ENOSPC)
+			btrfs_dump_space_info_for_trans_abort(fs_info);
+	}
 	/* Wake up anybody who may be waiting on this transaction */
 	wake_up(&fs_info->transaction_wait);
 	wake_up(&fs_info->transaction_blocked_wait);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 7d70fe486758..5e4b1106fd90 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -243,29 +243,47 @@ static inline bool btrfs_abort_should_print_stack(int error)
 }
 
 /*
- * Call btrfs_abort_transaction as early as possible when an error condition is
- * detected, that way the exact stack trace is reported for some errors.
+ * Compile-time and run-time verification of error passed to transaction abort.
+ * Direct constants will be caught at compile time, errors read from variables
+ * can be caught only at run-time and will warn under debugging config.
+ *
+ * How verification works:
+ * - accepted builtin constants are all -EIO and such
+ * - for compile-time check, invalid condition produces a negative-sized array
+ *   type, valid zero-sized
+ * - when a variable is passed as error the first check is a no-op
+ * - with enabled debugging, the second array type size is constructed from the
+ *   real variable value, valid condition produces array of size 1
+ * - sizeof(type) does not generate any code
+ */
+#define VERIFY_NEGATIVE_ERROR(error)						\
+do {										\
+	(void)sizeof(char[-!(__builtin_constant_p(error) ? (error) < 0 : 1)]);	\
+	if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {					\
+		if (sizeof(char[(error) < 0]) != 1)				\
+			DEBUG_WARN("error >= 0 passed to btrfs_abort_transaction()"); \
+	}									\
+} while(0)
+
+/*
+ * Call btrfs_abort_transaction() as early as possible when an error condition
+ * is detected, that way the exact stack trace is reported for some errors.
+ *
+ * Error number must be negative as it encodes whether it's the first abort.
  */
 #define btrfs_abort_transaction(trans, error)		\
 do {								\
-	bool __first = false;					\
+	int __error = (error);					\
+								\
+	VERIFY_NEGATIVE_ERROR(error);				\
 	/* Report first abort since mount */			\
 	if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,	\
 			&((trans)->fs_info->fs_state))) {	\
-		__first = true;					\
-		if (WARN(btrfs_abort_should_print_stack(error),	\
-			KERN_ERR				\
-			"BTRFS: Transaction aborted (error %d)\n",	\
-			(error))) {					\
-			/* Stack trace printed. */			\
-		} else {						\
-			btrfs_err((trans)->fs_info,			\
-				  "Transaction aborted (error %d)",	\
-				  (error));			\
-		}						\
+		WARN_ON(btrfs_abort_should_print_stack(__error)); \
+		__error = -__error;				\
 	}							\
 	__btrfs_abort_transaction((trans), __func__,		\
-				  __LINE__, (error), __first);	\
+				  __LINE__, __error);		\
 } while (0)
 
 int btrfs_end_transaction(struct btrfs_trans_handle *trans);
@@ -303,7 +321,7 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
 void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
 				      const char *function,
-				      unsigned int line, int error, bool first_hit);
+				      unsigned int line, int error);
 
 int __init btrfs_transaction_init(void);
 void __cold btrfs_transaction_exit(void);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 1f15d0793a9c..cb3e676a81cc 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -296,6 +296,33 @@ static int check_extent_data_item(struct extent_buffer *leaf,
 		return 0;
 	}
 
+	/*
+	 * For the data reloc tree, file extent items are written by
+	 * relocation's own paths. The data reloc inode is created with
+	 * BTRFS_INODE_NOCOMPRESS, so insert_ordered_extent_file_extent()
+	 * always leaves the compression field at 0. Encryption and
+	 * other_encoding are reserved-and-zero in btrfs. A non-zero value
+	 * for any of these means the leaf decoded from disk does not match
+	 * what the kernel wrote, i.e. on-disk corruption.
+	 *
+	 * The file_extent_item's offset field is NOT a universal invariant
+	 * here: partial-PREALLOC writebacks legitimately produce REG items
+	 * with non-zero offset at non-boundary keys. The offset check is
+	 * performed at the call site in get_new_location(), which only
+	 * inspects cluster-boundary keys where offset is always 0.
+	 */
+	if (unlikely(btrfs_header_owner(leaf) == BTRFS_DATA_RELOC_TREE_OBJECTID &&
+		     (btrfs_file_extent_compression(leaf, fi) ||
+		      btrfs_file_extent_encryption(leaf, fi) ||
+		      btrfs_file_extent_other_encoding(leaf, fi)))) {
+		file_extent_err(leaf, slot,
+"invalid encoding fields for data reloc tree, compression=%u encryption=%u other_encoding=%u",
+				btrfs_file_extent_compression(leaf, fi),
+				btrfs_file_extent_encryption(leaf, fi),
+				btrfs_file_extent_other_encoding(leaf, fi));
+		return -EUCLEAN;
+	}
+
 	/* Regular or preallocated extent has fixed item size */
 	if (unlikely(item_size != sizeof(*fi))) {
 		file_extent_err(leaf, slot,
@@ -1371,6 +1398,37 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
 	return 0;
 }
 
+/* Check a root ref/backref item: fixed header plus a name of 1..BTRFS_NAME_LEN bytes. */
+static int check_root_ref(struct extent_buffer *leaf, struct btrfs_key *key, int slot)
+{
+	const size_t header_size = sizeof(struct btrfs_root_ref);
+	const u32 size = btrfs_item_size(leaf, slot);
+	struct btrfs_root_ref *ref;
+	u32 len;
+
+	if (unlikely(size <= header_size)) {
+		generic_err(leaf, slot,
+			    "invalid root ref item size for key type %u, have %u expect > %zu",
+			    key->type, size, header_size);
+		return -EUCLEAN;
+	}
+	ref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
+	len = btrfs_root_ref_name_len(leaf, ref);
+	if (unlikely(len > BTRFS_NAME_LEN)) {
+		generic_err(leaf, slot,
+			    "root ref name too long for key type %u, have %u max %u",
+			    key->type, len, BTRFS_NAME_LEN);
+		return -EUCLEAN;
+	}
+	if (unlikely(size != header_size + len)) {
+		generic_err(leaf, slot,
+			    "invalid root ref item size for key type %u, have %u expect %zu",
+			    key->type, size, header_size + len);
+		return -EUCLEAN;
+	}
+	return 0;
+}
+
 __printf(3,4)
 __cold
 static void extent_err(const struct extent_buffer *eb, int slot,
@@ -2071,6 +2129,7 @@ static int check_free_space_info(struct extent_buffer *leaf, struct btrfs_key *k
 	struct btrfs_fs_info *fs_info = leaf->fs_info;
 	struct btrfs_free_space_info *fsi;
 	const u32 blocksize = fs_info->sectorsize;
+	u64 end;
 	u32 flags;
 
 	if (unlikely(!IS_ALIGNED(key->objectid, blocksize))) {
@@ -2085,6 +2144,12 @@ static int check_free_space_info(struct extent_buffer *leaf, struct btrfs_key *k
 			    blocksize, BTRFS_KEY_FMT_VALUE(key));
 		return -EUCLEAN;
 	}
+	if (unlikely(check_add_overflow(key->objectid, key->offset, &end))) {
+		generic_err(leaf, slot,
+			    "free space info key overflows, has " BTRFS_KEY_FMT,
+			    BTRFS_KEY_FMT_VALUE(key));
+		return -EUCLEAN;
+	}
 	if (unlikely(btrfs_item_size(leaf, slot) !=
 		     sizeof(struct btrfs_free_space_info))) {
 		generic_err(leaf, slot,
@@ -2112,23 +2177,98 @@ static int check_free_space_info(struct extent_buffer *leaf, struct btrfs_key *k
 	return 0;
 }
 
-static int check_free_space_extent(struct extent_buffer *leaf, struct btrfs_key *key, int slot)
+static int check_free_space_common_key(struct extent_buffer *leaf, struct btrfs_key *key, int slot,
+				       struct btrfs_key *prev_key)
 {
 	struct btrfs_fs_info *fs_info = leaf->fs_info;
 	const u32 blocksize = fs_info->sectorsize;
+	const char *type_str = (key->type == BTRFS_FREE_SPACE_EXTENT_KEY) ? "extent" : "bitmap";
+	u64 end;
 
 	if (unlikely(!IS_ALIGNED(key->objectid, blocksize))) {
 		generic_err(leaf, slot,
-		"free space extent key objectid is not aligned to %u, has " BTRFS_KEY_FMT,
-			    blocksize, BTRFS_KEY_FMT_VALUE(key));
+		"free space %s key objectid is not aligned to %u, has " BTRFS_KEY_FMT,
+			    type_str, blocksize, BTRFS_KEY_FMT_VALUE(key));
 		return -EUCLEAN;
 	}
 	if (unlikely(!IS_ALIGNED(key->offset, blocksize))) {
 		generic_err(leaf, slot,
-		"free space extent key offset is not aligned to %u, has " BTRFS_KEY_FMT,
-			    blocksize, BTRFS_KEY_FMT_VALUE(key));
+		"free space %s key offset is not aligned to %u, has " BTRFS_KEY_FMT,
+			    type_str, blocksize, BTRFS_KEY_FMT_VALUE(key));
+		return -EUCLEAN;
+	}
+	if (unlikely(key->offset == 0)) {
+		generic_err(leaf, slot, "free space %s length is 0", type_str);
+		return -EUCLEAN;
+	}
+	if (unlikely(check_add_overflow(key->objectid, key->offset, &end))) {
+		generic_err(leaf, slot,
+			    "free space %s end overflow, have objectid %llu offset %llu",
+			    type_str, key->objectid, key->offset);
+		return -EUCLEAN;
+	}
+	if (slot == 0)
+		return 0;
+
+	/*
+	 * Previous item is the space info: the current range must lie inside
+	 * its range and the key type must agree with the info's bitmap flag.
+	 */
+	if (prev_key->type == BTRFS_FREE_SPACE_INFO_KEY) {
+		struct btrfs_free_space_info *fsi;
+		u32 info_flags;
+
+		if (unlikely(key->objectid < prev_key->objectid ||
+			     key->objectid + key->offset > prev_key->objectid + prev_key->offset)) {
+			generic_err(leaf, slot,
+"free space %s is not inside the space info, prev key " BTRFS_KEY_FMT " current key " BTRFS_KEY_FMT,
+				    type_str, BTRFS_KEY_FMT_VALUE(prev_key),
+				    BTRFS_KEY_FMT_VALUE(key));
+			return -EUCLEAN;
+		}
+		fsi = btrfs_item_ptr(leaf, slot - 1, struct btrfs_free_space_info);
+		info_flags = btrfs_free_space_flags(leaf, fsi);
+		if (unlikely((info_flags == BTRFS_FREE_SPACE_USING_BITMAPS &&
+			      key->type == BTRFS_FREE_SPACE_EXTENT_KEY) ||
+			     (info_flags != BTRFS_FREE_SPACE_USING_BITMAPS &&
+			      key->type == BTRFS_FREE_SPACE_BITMAP_KEY))) {
+			generic_err(leaf, slot,
+"free space %s key type is not matching the type of space info, key type %u space info flags %u",
+				    type_str, key->type, info_flags);
+			return -EUCLEAN;
+		}
+		return 0;
+	}
+	/*
+	 * Otherwise the previous key must be a FREE_SPACE_EXTENT or
+	 * FREE_SPACE_BITMAP of the same type as the current key, and its range
+	 * must end at or before the start of the current one (no overlap).
+	 */
+	if (unlikely(key->type != prev_key->type)) {
+		generic_err(leaf, slot,
+"free space %s key type is not matching the type of previous key, key type %u prev key type %u",
+			    type_str, key->type, prev_key->type);
+		return -EUCLEAN;
+	}
+	if (unlikely(prev_key->objectid + prev_key->offset > key->objectid)) {
+		generic_err(leaf, slot,
+"free space %s key overlaps previous key, prev key " BTRFS_KEY_FMT " current key " BTRFS_KEY_FMT,
+			    type_str, BTRFS_KEY_FMT_VALUE(prev_key),
+			    BTRFS_KEY_FMT_VALUE(key));
 		return -EUCLEAN;
 	}
+	return 0;
+}
+
+static int check_free_space_extent(struct extent_buffer *leaf, struct btrfs_key *key, int slot,
+				   struct btrfs_key *prev_key)
+{
+	int ret;
+
+	ret = check_free_space_common_key(leaf, key, slot, prev_key);
+	if (unlikely(ret < 0))
+		return ret;
+
 	if (unlikely(btrfs_item_size(leaf, slot) != 0)) {
 		generic_err(leaf, slot,
 			    "invalid item size for free space info, has %u expect 0",
@@ -2139,28 +2279,17 @@ static int check_free_space_extent(struct extent_buffer *leaf, struct btrfs_key
 }
 
 static int check_free_space_bitmap(struct extent_buffer *leaf,
-				   struct btrfs_key *key, int slot)
+				   struct btrfs_key *key, int slot,
+				   struct btrfs_key *prev_key)
 {
 	struct btrfs_fs_info *fs_info = leaf->fs_info;
-	const u32 blocksize = fs_info->sectorsize;
 	u32 expected_item_size;
+	int ret;
+
+	ret = check_free_space_common_key(leaf, key, slot, prev_key);
+	if (unlikely(ret < 0))
+		return ret;
 
-	if (unlikely(!IS_ALIGNED(key->objectid, blocksize))) {
-		generic_err(leaf, slot,
-		"free space bitmap key objectid is not aligned to %u, has " BTRFS_KEY_FMT,
-			    blocksize, BTRFS_KEY_FMT_VALUE(key));
-		return -EUCLEAN;
-	}
-	if (unlikely(!IS_ALIGNED(key->offset, blocksize))) {
-		generic_err(leaf, slot,
-		"free space bitmap key offset is not aligned to %u, has " BTRFS_KEY_FMT,
-			    blocksize, BTRFS_KEY_FMT_VALUE(key));
-		return -EUCLEAN;
-	}
-	if (unlikely(key->offset == 0)) {
-		generic_err(leaf, slot, "free space bitmap length is 0");
-		return -EUCLEAN;
-	}
 	/*
 	 * The item must hold exactly the right number of bitmap bytes for the
 	 * range described by key->offset.  A mismatch means the item was
@@ -2226,6 +2355,10 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf,
 	case BTRFS_ROOT_ITEM_KEY:
 		ret = check_root_item(leaf, key, slot);
 		break;
+	case BTRFS_ROOT_REF_KEY:
+	case BTRFS_ROOT_BACKREF_KEY:
+		ret = check_root_ref(leaf, key, slot);
+		break;
 	case BTRFS_EXTENT_ITEM_KEY:
 	case BTRFS_METADATA_ITEM_KEY:
 		ret = check_extent_item(leaf, key, slot, prev_key);
@@ -2245,10 +2378,10 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf,
 		ret = check_free_space_info(leaf, key, slot);
 		break;
 	case BTRFS_FREE_SPACE_EXTENT_KEY:
-		ret = check_free_space_extent(leaf, key, slot);
+		ret = check_free_space_extent(leaf, key, slot, prev_key);
 		break;
 	case BTRFS_FREE_SPACE_BITMAP_KEY:
-		ret = check_free_space_bitmap(leaf, key, slot);
+		ret = check_free_space_bitmap(leaf, key, slot, prev_key);
 		break;
 	case BTRFS_IDENTITY_REMAP_KEY:
 	case BTRFS_REMAP_KEY:
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9123adafa0d1..875e4ddc68ea 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -33,17 +33,6 @@
 
 #define MAX_CONFLICT_INODES 10
 
-/* magic values for the inode_only field in btrfs_log_inode:
- *
- * LOG_INODE_ALL means to log everything
- * LOG_INODE_EXISTS means to log just enough to recreate the inode
- * during log replay
- */
-enum {
-	LOG_INODE_ALL,
-	LOG_INODE_EXISTS,
-};
-
 /*
  * directory trouble cases
  *
@@ -227,7 +216,7 @@ static void do_abort_log_replay(struct walk_control *wc, const char *function,
 
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 			   struct btrfs_inode *inode,
-			   int inode_only,
+			   enum btrfs_log_mode log_mode,
 			   struct btrfs_log_ctx *ctx);
 static int link_to_fixup_dir(struct walk_control *wc, u64 objectid);
 static noinline int replay_dir_deletes(struct walk_control *wc,
@@ -502,7 +491,7 @@ static int overwrite_item(struct walk_control *wc)
 	 * the leaf before writing into the log tree. See the comments at
 	 * copy_items() for more details.
 	 */
-	ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID, "root_id=%llu", btrfs_root_id(root));
+	ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
 
 	item_size = btrfs_item_size(wc->log_leaf, wc->log_slot);
 	src_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot);
@@ -3333,8 +3322,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	u64 log_root_level;
 
 	mutex_lock(&root->log_mutex);
+	trace_btrfs_sync_log_enter(trans, root, ctx);
 	log_transid = ctx->log_transid;
 	if (root->log_transid_committed >= log_transid) {
+		trace_btrfs_sync_log_exit(trans, root, ctx, ctx->log_ret);
 		mutex_unlock(&root->log_mutex);
 		return ctx->log_ret;
 	}
@@ -3342,6 +3333,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	index1 = log_transid % 2;
 	if (atomic_read(&root->log_commit[index1])) {
 		wait_log_commit(root, log_transid);
+		trace_btrfs_sync_log_exit(trans, root, ctx, ctx->log_ret);
 		mutex_unlock(&root->log_mutex);
 		return ctx->log_ret;
 	}
@@ -3370,6 +3362,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	/* bail out if we need to do a full commit */
 	if (btrfs_need_log_full_commit(trans)) {
 		ret = BTRFS_LOG_FORCE_COMMIT;
+		trace_btrfs_sync_log_exit(trans, root, ctx, ret);
 		mutex_unlock(&root->log_mutex);
 		goto out;
 	}
@@ -3396,6 +3389,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
 		ret = 0;
 	if (ret) {
+		trace_btrfs_sync_log_exit(trans, root, ctx, ret);
 		blk_finish_plug(&plug);
 		btrfs_set_log_full_commit(trans);
 		mutex_unlock(&root->log_mutex);
@@ -3433,6 +3427,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		if (!log_root_tree->node) {
 			ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
 			if (ret) {
+				trace_btrfs_sync_log_exit(trans, root, ctx, ret);
 				mutex_unlock(&fs_info->tree_root->log_mutex);
 				blk_finish_plug(&plug);
 				goto out;
@@ -3456,6 +3451,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 */
 	ret = update_log_root(trans, log, &new_root_item);
 	if (ret) {
+		trace_btrfs_sync_log_exit(trans, root, ctx, ret);
 		list_del_init(&root_log_ctx.list);
 		blk_finish_plug(&plug);
 		btrfs_set_log_full_commit(trans);
@@ -3473,6 +3469,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		list_del_init(&root_log_ctx.list);
 		mutex_unlock(&log_root_tree->log_mutex);
 		ret = root_log_ctx.log_ret;
+		trace_btrfs_sync_log_exit(trans, root, ctx, ret);
 		goto out;
 	}
 
@@ -3484,6 +3481,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		mutex_unlock(&log_root_tree->log_mutex);
 		if (!ret)
 			ret = root_log_ctx.log_ret;
+		trace_btrfs_sync_log_exit(trans, root, ctx, ret);
 		goto out;
 	}
 	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid,
@@ -3505,6 +3503,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		btrfs_wait_tree_log_extents(log, mark);
 		mutex_unlock(&log_root_tree->log_mutex);
 		ret = BTRFS_LOG_FORCE_COMMIT;
+		trace_btrfs_sync_log_exit(trans, root, ctx, ret);
 		goto out_wake_log_root;
 	}
 
@@ -3518,11 +3517,13 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 * deadlock. Bail out to the full commit instead.
 	 */
 	if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
+		trace_btrfs_sync_log_exit(trans, root, ctx, ret);
 		btrfs_set_log_full_commit(trans);
 		btrfs_wait_tree_log_extents(log, mark);
 		mutex_unlock(&log_root_tree->log_mutex);
 		goto out_wake_log_root;
 	} else if (ret) {
+		trace_btrfs_sync_log_exit(trans, root, ctx, ret);
 		btrfs_set_log_full_commit(trans);
 		mutex_unlock(&log_root_tree->log_mutex);
 		goto out_wake_log_root;
@@ -3532,6 +3533,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		ret = btrfs_wait_tree_log_extents(log_root_tree,
 						  EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
 	if (ret) {
+		trace_btrfs_sync_log_exit(trans, root, ctx, ret);
 		btrfs_set_log_full_commit(trans);
 		mutex_unlock(&log_root_tree->log_mutex);
 		goto out_wake_log_root;
@@ -3568,6 +3570,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 */
 	if (unlikely(BTRFS_FS_ERROR(fs_info))) {
 		ret = -EIO;
+		trace_btrfs_sync_log_exit(trans, root, ctx, ret);
 		btrfs_set_log_full_commit(trans);
 		btrfs_abort_transaction(trans, ret);
 		mutex_unlock(&fs_info->tree_log_mutex);
@@ -3579,6 +3582,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	ret = write_all_supers(trans);
 	mutex_unlock(&fs_info->tree_log_mutex);
 	if (unlikely(ret)) {
+		trace_btrfs_sync_log_exit(trans, root, ctx, ret);
 		btrfs_set_log_full_commit(trans);
 		btrfs_abort_transaction(trans, ret);
 		goto out_wake_log_root;
@@ -4771,7 +4775,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 			       struct btrfs_inode *inode,
 			       struct btrfs_path *dst_path,
 			       struct btrfs_path *src_path,
-			       int start_slot, int nr, int inode_only,
+			       int start_slot, int nr, enum btrfs_log_mode log_mode,
 			       u64 logged_isize, struct btrfs_log_ctx *ctx)
 {
 	struct btrfs_root *log = inode->root->log_root;
@@ -4985,7 +4989,7 @@ copy_item:
 			inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot,
 						    struct btrfs_inode_item);
 			fill_inode_item(trans, dst_path->nodes[0], inode_item,
-					inode, inode_only == LOG_INODE_EXISTS,
+					inode, log_mode == LOG_INODE_EXISTS,
 					logged_isize);
 		} else {
 			copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
@@ -5913,9 +5917,13 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
 	struct btrfs_inode *curr_inode = start_inode;
 	int ret = 0;
 
+	trace_btrfs_log_new_dir_dentries_enter(trans, start_inode);
+
 	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	/* Pairs with btrfs_add_delayed_iput below. */
 	ihold(&curr_inode->vfs_inode);
@@ -6034,6 +6042,8 @@ out:
 			kfree(dir_elem);
 	}
 
+	trace_btrfs_log_new_dir_dentries_exit(trans, start_inode, ret);
+
 	return ret;
 }
 
@@ -6126,6 +6136,9 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_ino_list *ino_elem;
 	struct btrfs_inode *inode;
+	int ret = 0;
+
+	trace_btrfs_add_conflicting_inode_enter(trans, ctx, ino, parent);
 
 	/*
 	 * It's rare to have a lot of conflicting inodes, in practice it is not
@@ -6134,8 +6147,10 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
 	 * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
 	 * commits.
 	 */
-	if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
-		return BTRFS_LOG_FORCE_COMMIT;
+	if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) {
+		ret = BTRFS_LOG_FORCE_COMMIT;
+		goto out;
+	}
 
 	inode = btrfs_iget_logging(ino, root);
 	/*
@@ -6159,26 +6174,27 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
 	 *    some inode from it to some other directory).
 	 */
 	if (IS_ERR(inode)) {
-		int ret = PTR_ERR(inode);
-
+		ret = PTR_ERR(inode);
 		if (ret != -ENOENT)
-			return ret;
+			goto out;
 
 		ret = conflicting_inode_is_dir(root, ino, path);
 		/* Not a directory or we got an error. */
 		if (ret <= 0)
-			return ret;
+			goto out;
 
 		/* Conflicting inode is a directory, so we'll log its parent. */
 		ino_elem = kmalloc_obj(*ino_elem, GFP_NOFS);
-		if (!ino_elem)
-			return -ENOMEM;
+		if (!ino_elem) {
+			ret = -ENOMEM;
+			goto out;
+		}
 		ino_elem->ino = ino;
 		ino_elem->parent = parent;
 		list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
 		ctx->num_conflict_inodes++;
-
-		return 0;
+		ret = 0;
+		goto out;
 	}
 
 	/*
@@ -6218,25 +6234,31 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
 	 */
 	if (!need_log_inode(trans, inode)) {
 		btrfs_add_delayed_iput(inode);
-		return 0;
+		goto out;
 	}
 
 	if (!can_log_conflicting_inode(trans, inode)) {
 		btrfs_add_delayed_iput(inode);
-		return BTRFS_LOG_FORCE_COMMIT;
+		ret = BTRFS_LOG_FORCE_COMMIT;
+		goto out;
 	}
 
 	btrfs_add_delayed_iput(inode);
 
 	ino_elem = kmalloc_obj(*ino_elem, GFP_NOFS);
-	if (!ino_elem)
-		return -ENOMEM;
+	if (!ino_elem) {
+		ret = -ENOMEM;
+		goto out;
+	}
 	ino_elem->ino = ino;
 	ino_elem->parent = parent;
 	list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
 	ctx->num_conflict_inodes++;
 
-	return 0;
+out:
+	trace_btrfs_add_conflicting_inode_exit(trans, ctx, ino, parent, ret);
+
+	return ret;
 }
 
 static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
@@ -6254,7 +6276,15 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
 	if (ctx->logging_conflict_inodes)
 		return 0;
 
+	/*
+	 * If there are no conflicting inodes there is nothing to do, so return
+	 * early and skip the trace events, which would only add noise.
+	 */
+	if (list_empty(&ctx->conflict_inodes))
+		return 0;
+
 	ctx->logging_conflict_inodes = true;
+	trace_btrfs_log_conflicting_inodes_enter(trans, ctx);
 
 	/*
 	 * New conflicting inodes may be found and added to the list while we
@@ -6348,6 +6378,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
 	ctx->logging_conflict_inodes = false;
 	if (ret)
 		free_conflicting_inodes(ctx);
+	trace_btrfs_log_conflicting_inodes_exit(trans, ctx, ret);
 
 	return ret;
 }
@@ -6359,7 +6390,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
 				   struct btrfs_path *path,
 				   struct btrfs_path *dst_path,
 				   const u64 logged_isize,
-				   const int inode_only,
+				   const enum btrfs_log_mode log_mode,
 				   struct btrfs_log_ctx *ctx,
 				   bool *need_log_inode_item)
 {
@@ -6415,7 +6446,7 @@ again:
 				}
 				ret = copy_items(trans, inode, dst_path, path,
 						 ins_start_slot, ins_nr,
-						 inode_only, logged_isize, ctx);
+						 log_mode, logged_isize, ctx);
 				if (ret < 0)
 					return ret;
 				ins_nr = 0;
@@ -6434,7 +6465,7 @@ again:
 				goto next_slot;
 			ret = copy_items(trans, inode, dst_path, path,
 					 ins_start_slot,
-					 ins_nr, inode_only, logged_isize, ctx);
+					 ins_nr, log_mode, logged_isize, ctx);
 			if (ret < 0)
 				return ret;
 			ins_nr = 0;
@@ -6451,7 +6482,7 @@ again:
 		}
 
 		ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
-				 ins_nr, inode_only, logged_isize, ctx);
+				 ins_nr, log_mode, logged_isize, ctx);
 		if (ret < 0)
 			return ret;
 		ins_nr = 1;
@@ -6465,7 +6496,7 @@ next_slot:
 		}
 		if (ins_nr) {
 			ret = copy_items(trans, inode, dst_path, path,
-					 ins_start_slot, ins_nr, inode_only,
+					 ins_start_slot, ins_nr, log_mode,
 					 logged_isize, ctx);
 			if (ret < 0)
 				return ret;
@@ -6491,12 +6522,12 @@ next_key:
 	}
 	if (ins_nr) {
 		ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
-				 ins_nr, inode_only, logged_isize, ctx);
+				 ins_nr, log_mode, logged_isize, ctx);
 		if (ret)
 			return ret;
 	}
 
-	if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
+	if (log_mode == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
 		/*
 		 * Release the path because otherwise we might attempt to double
 		 * lock the same leaf with btrfs_log_prealloc_extents() below.
@@ -6827,8 +6858,16 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
 	 */
 	lockdep_assert_not_held(&inode->log_mutex);
 
-	ASSERT(!ctx->logging_new_delayed_dentries,
-	       "ctx->logging_new_delayed_dentries=%d", ctx->logging_new_delayed_dentries);
+	ASSERT(!ctx->logging_new_delayed_dentries);
+
+	/*
+	 * Return early if the list is empty, to avoid emitting redundant trace
+	 * events that only generate noise.
+	 */
+	if (list_empty(delayed_ins_list))
+		return 0;
+
+	trace_btrfs_log_new_delayed_dentries_enter(trans, inode);
 	ctx->logging_new_delayed_dentries = true;
 
 	list_for_each_entry(item, delayed_ins_list, log_list) {
@@ -6871,6 +6910,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
 
 	ctx->log_new_dentries = orig_log_new_dentries;
 	ctx->logging_new_delayed_dentries = false;
+	trace_btrfs_log_new_delayed_dentries_exit(trans, inode, ret);
 
 	return ret;
 }
@@ -6891,11 +6931,11 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
  */
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 			   struct btrfs_inode *inode,
-			   int inode_only,
+			   enum btrfs_log_mode log_mode,
 			   struct btrfs_log_ctx *ctx)
 {
 	struct btrfs_path *path;
-	struct btrfs_path *dst_path;
+	struct btrfs_path *dst_path = NULL;
 	struct btrfs_key min_key;
 	struct btrfs_key max_key;
 	struct btrfs_root *log = inode->root->log_root;
@@ -6911,13 +6951,17 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	LIST_HEAD(delayed_ins_list);
 	LIST_HEAD(delayed_del_list);
 
+	trace_btrfs_log_inode_enter(trans, inode, ctx, log_mode);
+
 	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
 	dst_path = btrfs_alloc_path();
 	if (!dst_path) {
-		btrfs_free_path(path);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out;
 	}
 
 	min_key.objectid = ino;
@@ -6931,13 +6975,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	if (S_ISDIR(inode->vfs_inode.i_mode) ||
 	    (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
 		       &inode->runtime_flags) &&
-	     inode_only >= LOG_INODE_EXISTS))
+	     log_mode >= LOG_INODE_EXISTS))
 		max_key.type = BTRFS_XATTR_ITEM_KEY;
 	else
 		max_key.type = (u8)-1;
 	max_key.offset = (u64)-1;
 
-	if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL)
+	if (S_ISDIR(inode->vfs_inode.i_mode) && log_mode == LOG_INODE_ALL)
 		full_dir_logging = true;
 
 	/*
@@ -6988,7 +7032,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	 * for symlinks).
 	 */
 	if (S_ISLNK(inode->vfs_inode.i_mode))
-		inode_only = LOG_INODE_ALL;
+		log_mode = LOG_INODE_ALL;
 
 	/*
 	 * Before logging the inode item, cache the value returned by
@@ -7023,7 +7067,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 			ret = drop_inode_items(trans, log, path, inode,
 					       BTRFS_XATTR_ITEM_KEY);
 	} else {
-		if (inode_only == LOG_INODE_EXISTS) {
+		if (log_mode == LOG_INODE_EXISTS) {
 			/*
 			 * Make sure the new inode item we write to the log has
 			 * the same isize as the current one (if it exists).
@@ -7043,7 +7087,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 		}
 		if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
 			     &inode->runtime_flags)) {
-			if (inode_only == LOG_INODE_EXISTS) {
+			if (log_mode == LOG_INODE_EXISTS) {
 				max_key.type = BTRFS_XATTR_ITEM_KEY;
 				if (ctx->logged_before)
 					ret = drop_inode_items(trans, log, path,
@@ -7059,15 +7103,15 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 			}
 		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
 					      &inode->runtime_flags) ||
-			   inode_only == LOG_INODE_EXISTS) {
-			if (inode_only == LOG_INODE_ALL)
+			   log_mode == LOG_INODE_EXISTS) {
+			if (log_mode == LOG_INODE_ALL)
 				fast_search = true;
 			max_key.type = BTRFS_XATTR_ITEM_KEY;
 			if (ctx->logged_before)
 				ret = drop_inode_items(trans, log, path, inode,
 						       max_key.type);
 		} else {
-			if (inode_only == LOG_INODE_ALL)
+			if (log_mode == LOG_INODE_ALL)
 				fast_search = true;
 			inode_item_dropped = false;
 			goto log_extents;
@@ -7102,8 +7146,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
 	ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
 				      path, dst_path, logged_isize,
-				      inode_only, ctx,
-				      &need_log_inode_item);
+				      log_mode, ctx, &need_log_inode_item);
 	if (ret)
 		goto out_unlock;
 
@@ -7146,7 +7189,7 @@ log_extents:
 		ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
 		if (ret)
 			goto out_unlock;
-	} else if (inode_only == LOG_INODE_ALL) {
+	} else if (log_mode == LOG_INODE_ALL) {
 		struct extent_map *em, *n;
 
 		write_lock(&em_tree->lock);
@@ -7202,7 +7245,7 @@ log_extents:
 	 *    a power failure unless the log was synced as part of an fsync
 	 *    against any other unrelated inode.
 	 */
-	if (!ctx->logging_new_name && inode_only != LOG_INODE_EXISTS)
+	if (!ctx->logging_new_name && log_mode != LOG_INODE_EXISTS)
 		inode->last_log_commit = inode->last_sub_trans;
 	spin_unlock(&inode->lock);
 
@@ -7210,7 +7253,7 @@ log_extents:
 	 * Reset the last_reflink_trans so that the next fsync does not need to
 	 * go through the slower path when logging extents and their checksums.
 	 */
-	if (inode_only == LOG_INODE_ALL)
+	if (log_mode == LOG_INODE_ALL)
 		inode->last_reflink_trans = 0;
 
 out_unlock:
@@ -7233,6 +7276,8 @@ out:
 					    &delayed_del_list);
 	}
 
+	trace_btrfs_log_inode_exit(trans, inode, ret);
+
 	return ret;
 }
 
@@ -7246,9 +7291,13 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
 	struct btrfs_root *root = inode->root;
 	const u64 ino = btrfs_ino(inode);
 
+	trace_btrfs_log_all_parents_enter(trans, inode);
+
 	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
 	path->skip_locking = true;
 	path->search_commit_root = true;
 
@@ -7257,7 +7306,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
 	key.offset = 0;
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
-		return ret;
+		goto out;
 
 	while (true) {
 		struct extent_buffer *leaf = path->nodes[0];
@@ -7269,9 +7318,11 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
 		if (slot >= btrfs_header_nritems(leaf)) {
 			ret = btrfs_next_leaf(root, path);
 			if (ret < 0)
-				return ret;
-			if (ret > 0)
+				goto out;
+			if (ret > 0) {
+				ret = 0;
 				break;
+			}
 			continue;
 		}
 
@@ -7324,8 +7375,10 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
 			 * at both parents and the old parent B would still
 			 * exist.
 			 */
-			if (IS_ERR(dir_inode))
-				return PTR_ERR(dir_inode);
+			if (IS_ERR(dir_inode)) {
+				ret = PTR_ERR(dir_inode);
+				goto out;
+			}
 
 			if (!need_log_inode(trans, dir_inode)) {
 				btrfs_add_delayed_iput(dir_inode);
@@ -7338,11 +7391,14 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
 				ret = log_new_dir_dentries(trans, dir_inode, ctx);
 			btrfs_add_delayed_iput(dir_inode);
 			if (ret)
-				return ret;
+				goto out;
 		}
 		path->slots[0]++;
 	}
-	return 0;
+out:
+	trace_btrfs_log_all_parents_exit(trans, inode, ret);
+
+	return ret;
 }
 
 static int log_new_ancestors(struct btrfs_trans_handle *trans,
@@ -7457,16 +7513,22 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
 	struct btrfs_key search_key;
 	int ret;
 
+	trace_btrfs_log_all_new_ancestors_enter(trans, inode);
+
 	/*
 	 * For a single hard link case, go through a fast path that does not
 	 * need to iterate the fs/subvolume tree.
 	 */
-	if (inode->vfs_inode.i_nlink < 2)
-		return log_new_ancestors_fast(trans, inode, parent, ctx);
+	if (inode->vfs_inode.i_nlink < 2) {
+		ret = log_new_ancestors_fast(trans, inode, parent, ctx);
+		goto out;
+	}
 
 	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	search_key.objectid = ino;
 	search_key.type = BTRFS_INODE_REF_KEY;
@@ -7474,7 +7536,7 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
 again:
 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
 	if (ret < 0)
-		return ret;
+		goto out;
 	if (ret == 0)
 		path->slots[0]++;
 
@@ -7486,9 +7548,11 @@ again:
 		if (slot >= btrfs_header_nritems(leaf)) {
 			ret = btrfs_next_leaf(root, path);
 			if (ret < 0)
-				return ret;
-			if (ret > 0)
+				goto out;
+			if (ret > 0) {
+				ret = 0;
 				break;
+			}
 			continue;
 		}
 
@@ -7504,8 +7568,10 @@ again:
 		 * this loop, etc). So just return some error to fallback to
 		 * a transaction commit.
 		 */
-		if (found_key.type == BTRFS_INODE_EXTREF_KEY)
-			return -EMLINK;
+		if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
+			ret = -EMLINK;
+			goto out;
+		}
 
 		/*
 		 * Logging ancestors needs to do more searches on the fs/subvol
@@ -7517,11 +7583,13 @@ again:
 
 		ret = log_new_ancestors(trans, root, path, ctx);
 		if (ret)
-			return ret;
+			goto out;
 		btrfs_release_path(path);
 		goto again;
 	}
-	return 0;
+out:
+	trace_btrfs_log_all_new_ancestors_exit(trans, inode, ret);
+	return ret;
 }
 
 /*
@@ -7533,7 +7601,7 @@ again:
 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 				  struct btrfs_inode *inode,
 				  struct dentry *parent,
-				  int inode_only,
+				  enum btrfs_log_mode log_mode,
 				  struct btrfs_log_ctx *ctx)
 {
 	struct btrfs_root *root = inode->root;
@@ -7541,29 +7609,39 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 	int ret = 0;
 	bool log_dentries;
 
-	if (btrfs_test_opt(fs_info, NOTREELOG))
-		return BTRFS_LOG_FORCE_COMMIT;
+	trace_btrfs_log_inode_parent_enter(trans, inode);
+
+	if (btrfs_test_opt(fs_info, NOTREELOG)) {
+		ret = BTRFS_LOG_FORCE_COMMIT;
+		goto out;
+	}
 
-	if (btrfs_root_refs(&root->root_item) == 0)
-		return BTRFS_LOG_FORCE_COMMIT;
+	if (btrfs_root_refs(&root->root_item) == 0) {
+		ret = BTRFS_LOG_FORCE_COMMIT;
+		goto out;
+	}
 
 	/*
 	 * If we're logging an inode from a subvolume created in the current
 	 * transaction we must force a commit since the root is not persisted.
 	 */
-	if (btrfs_root_generation(&root->root_item) == trans->transid)
-		return BTRFS_LOG_FORCE_COMMIT;
+	if (btrfs_root_generation(&root->root_item) == trans->transid) {
+		ret = BTRFS_LOG_FORCE_COMMIT;
+		goto out;
+	}
 
 	/* Skip already logged inodes and without new extents. */
 	if (btrfs_inode_in_log(inode, trans->transid) &&
-	    list_empty(&ctx->ordered_extents))
-		return BTRFS_NO_LOG_SYNC;
+	    list_empty(&ctx->ordered_extents)) {
+		ret = BTRFS_NO_LOG_SYNC;
+		goto out;
+	}
 
 	ret = start_log_trans(trans, root, ctx);
 	if (ret)
-		return ret;
+		goto out;
 
-	ret = btrfs_log_inode(trans, inode, inode_only, ctx);
+	ret = btrfs_log_inode(trans, inode, log_mode, ctx);
 	if (ret)
 		goto end_trans;
 
@@ -7649,6 +7727,9 @@ end_trans:
 		btrfs_remove_log_ctx(root, ctx);
 	btrfs_end_log_trans(root);
 
+out:
+	trace_btrfs_log_inode_parent_exit(trans, inode, ret);
+
 	return ret;
 }
 
@@ -7872,6 +7953,8 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
 			     struct btrfs_inode *dir, struct btrfs_inode *inode,
 			     bool for_rename)
 {
+	trace_btrfs_record_unlink_dir(trans, dir, inode, for_rename);
+
 	/*
 	 * when we're logging a file, if it hasn't been renamed
 	 * or unlinked, and its inode is fully committed on disk,
@@ -7934,6 +8017,8 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
 				   struct btrfs_inode *dir)
 {
+	trace_btrfs_record_snapshot_destroy(trans, dir);
+
 	mutex_lock(&dir->log_mutex);
 	dir->last_unlink_trans = trans->transid;
 	mutex_unlock(&dir->log_mutex);
@@ -7954,6 +8039,8 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
 void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans,
 				struct btrfs_inode *dir)
 {
+	trace_btrfs_record_new_subvolume(trans, dir);
+
 	mutex_lock(&dir->log_mutex);
 	dir->last_unlink_trans = trans->transid;
 	mutex_unlock(&dir->log_mutex);
@@ -7986,6 +8073,8 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
 	bool log_pinned = false;
 	int ret;
 
+	trace_btrfs_log_new_name_enter(trans, inode, old_dir, old_dir_index);
+
 	/* The inode has a new name (ref/extref), so make sure we log it. */
 	set_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
 
@@ -8008,7 +8097,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
 		goto out;
 	} else if (ret == 0) {
 		if (!old_dir)
-			return;
+			goto out;
 		/*
 		 * If the inode was not logged and we are doing a rename (old_dir is not
 		 * NULL), check if old_dir was logged - if it was not we can return and
@@ -8018,7 +8107,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
 		if (ret < 0)
 			goto out;
 		else if (ret == 0)
-			return;
+			goto out;
 	}
 	ret = 0;
 
@@ -8117,6 +8206,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
 	btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
 	ASSERT(list_empty(&ctx.conflict_inodes));
 out:
+	trace_btrfs_log_new_name_exit(trans, inode, old_dir, ret);
 	/*
 	 * If an error happened mark the log for a full commit because it's not
 	 * consistent and up to date or we couldn't find out if one of the
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 4a626dc6a58b..81ab5eeeb974 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -11,6 +11,13 @@
 #include <linux/fscrypt.h>
 #include "transaction.h"
 
+enum btrfs_log_mode {
+	/* Log everything about an inode. */
+	LOG_INODE_ALL,
+	/* Log just enough to recreate the inode during log replay. */
+	LOG_INODE_EXISTS,
+};
+
 struct inode;
 struct dentry;
 struct btrfs_ordered_extent;
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index 0062b3a55781..983365a73541 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -458,7 +458,7 @@ static int rollback_verity(struct btrfs_inode *inode)
 	if (ret) {
 		btrfs_handle_fs_error(root->fs_info, ret,
 				"failed to drop verity items in rollback %llu",
-				(u64)inode->vfs_inode.i_ino);
+				inode->vfs_inode.i_ino);
 		goto out;
 	}
 
@@ -472,7 +472,7 @@ static int rollback_verity(struct btrfs_inode *inode)
 		trans = NULL;
 		btrfs_handle_fs_error(root->fs_info, ret,
 			"failed to start transaction in verity rollback %llu",
-			(u64)inode->vfs_inode.i_ino);
+			inode->vfs_inode.i_ino);
 		goto out;
 	}
 	inode->ro_flags &= ~BTRFS_INODE_RO_VERITY;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a88e68f90564..6eab4cc73ce4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -963,6 +963,11 @@ static noinline struct btrfs_device *device_list_add(const char *path,
 					  devid, btrfs_dev_name(device),
 					  path, current->comm,
 					  task_pid_nr(current));
+		} else {
+			btrfs_info(NULL,
+	"missing devid %llu re-appeared at %s scanned by %s (%d)",
+				   devid, path, current->comm,
+				   task_pid_nr(current));
 		}
 
 		name = kstrdup(path, GFP_NOFS);
@@ -1369,9 +1374,9 @@ struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
 				      (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT);
 	}
 
-	filemap_invalidate_lock(mapping);
+	filemap_invalidate_lock_shared(mapping);
 	page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
-	filemap_invalidate_unlock(mapping);
+	filemap_invalidate_unlock_shared(mapping);
 	if (IS_ERR(page))
 		return ERR_CAST(page);
 
@@ -2286,6 +2291,38 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_devic
 	update_dev_time(rcu_dereference_raw(device->name));
 }
 
+/*
+ * Delete the dev stats item of @devid from fs_info->dev_root. Return 0 on
+ * success or if the item does not exist, a negative value on error.
+ */
+int btrfs_remove_dev_stat_item(struct btrfs_trans_handle *trans, u64 devid)
+{
+	BTRFS_PATH_AUTO_RELEASE(path);
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_root *dev_root = fs_info->dev_root;
+	struct btrfs_key key = {
+		.objectid = BTRFS_DEV_STATS_OBJECTID,
+		.type = BTRFS_PERSISTENT_ITEM_KEY,
+		.offset = devid,
+	};
+	int ret;
+
+	ret = btrfs_search_slot(trans, dev_root, &key, &path, -1, 1);
+	if (ret) {
+		/* A positive value means there is no item, nothing to do. */
+		if (ret > 0)
+			return 0;
+		btrfs_warn(fs_info, "error %d while searching for dev_stats item for devid %llu",
+			   ret, devid);
+		return ret;
+	}
+	ret = btrfs_del_item(trans, dev_root, &path);
+	if (ret < 0)
+		btrfs_warn(fs_info, "error %d while deleting dev_stats item for devid %llu",
+			   ret, devid);
+	return ret < 0 ? ret : 0;
+}
+
 int btrfs_rm_device(struct btrfs_fs_info *fs_info,
 		    struct btrfs_dev_lookup_args *args,
 		    struct file **bdev_file)
@@ -2365,6 +2402,12 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
 		return ret;
 	}
 
+	ret = btrfs_remove_dev_stat_item(trans, device->devid);
+	if (unlikely(ret)) {
+		btrfs_abort_transaction(trans, ret);
+		btrfs_end_transaction(trans);
+		return ret;
+	}
 	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
 	btrfs_scrub_cancel_dev(device);
 
@@ -2889,6 +2932,12 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	device->commit_total_bytes = device->total_bytes;
 	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
 	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
+
+	/*
+	 * Increase dev_stats_ccnt so that corresponding DEV_STATS item can be
+	 * created at the next transaction commit.
+	 */
+	atomic_inc(&device->dev_stats_ccnt);
 	device->dev_stats_valid = 1;
 	set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE);
 
@@ -3718,7 +3767,11 @@ static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
 	u64 chunk_type;
 
 	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
-	ASSERT(cache);
+	if (unlikely(!cache)) {
+		btrfs_err(fs_info, "balance: chunk at bytenr %llu has no corresponding block group",
+			  chunk_offset);
+		return -EUCLEAN;
+	}
 	chunk_type = cache->flags;
 	btrfs_put_block_group(cache);
 
@@ -3957,16 +4010,21 @@ static bool chunk_profiles_filter(u64 chunk_type, struct btrfs_balance_args *bar
 	return true;
 }
 
-static bool chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
-				     struct btrfs_balance_args *bargs)
+static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+				    struct btrfs_balance_args *bargs)
 {
 	struct btrfs_block_group *cache;
 	u64 chunk_used;
 	u64 user_thresh_min;
 	u64 user_thresh_max;
-	bool ret = true;
+	int ret = 1;
 
 	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+	if (unlikely(!cache)) {
+		btrfs_err(fs_info, "balance: chunk at bytenr %llu has no corresponding block group",
+			  chunk_offset);
+		return -EUCLEAN;
+	}
 	chunk_used = cache->used;
 
 	if (bargs->usage_min == 0)
@@ -3982,20 +4040,25 @@ static bool chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_of
 		user_thresh_max = mult_perc(cache->length, bargs->usage_max);
 
 	if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
-		ret = false;
+		ret = 0;
 
 	btrfs_put_block_group(cache);
 	return ret;
 }
 
-static bool chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
-			       struct btrfs_balance_args *bargs)
+static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+			      struct btrfs_balance_args *bargs)
 {
 	struct btrfs_block_group *cache;
 	u64 chunk_used, user_thresh;
-	bool ret = true;
+	int ret = 1;
 
 	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+	if (unlikely(!cache)) {
+		btrfs_err(fs_info, "balance: chunk at bytenr %llu has no corresponding block group",
+			  chunk_offset);
+		return -EUCLEAN;
+	}
 	chunk_used = cache->used;
 
 	if (bargs->usage_min == 0)
@@ -4006,7 +4069,7 @@ static bool chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
 		user_thresh = mult_perc(cache->length, bargs->usage);
 
 	if (chunk_used < user_thresh)
-		ret = false;
+		ret = 0;
 
 	btrfs_put_block_group(cache);
 	return ret;
@@ -4111,8 +4174,8 @@ static bool chunk_soft_convert_filter(u64 chunk_type, struct btrfs_balance_args
 	return false;
 }
 
-static bool should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk *chunk,
-				 u64 chunk_offset)
+static int should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk *chunk,
+				u64 chunk_offset)
 {
 	struct btrfs_fs_info *fs_info = leaf->fs_info;
 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
@@ -4145,12 +4208,22 @@ static bool should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk
 	}
 
 	/* usage filter */
-	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
-	    chunk_usage_filter(fs_info, chunk_offset, bargs)) {
-		return false;
-	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
-	    chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
-		return false;
+	if (bargs->flags & BTRFS_BALANCE_ARGS_USAGE) {
+		int ret2;
+
+		ret2 = chunk_usage_filter(fs_info, chunk_offset, bargs);
+		if (ret2 < 0)
+			return ret2;
+		if (ret2)
+			return false;
+	} else if (bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) {
+		int ret2;
+
+		ret2 = chunk_usage_range_filter(fs_info, chunk_offset, bargs);
+		if (ret2 < 0)
+			return ret2;
+		if (ret2)
+			return false;
 	}
 
 	/* devid filter */
@@ -4337,7 +4410,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 	u32 count_data = 0;
 	u32 count_meta = 0;
 	u32 count_sys = 0;
-	int chunk_reserved = 0;
+	bool chunk_reserved = false;
 	struct remap_chunk_info *rci;
 	unsigned int num_remap_chunks = 0;
 	LIST_HEAD(remap_chunks);
@@ -4430,6 +4503,10 @@ again:
 		ret = should_balance_chunk(leaf, chunk, found_key.offset);
 
 		btrfs_release_path(path);
+		if (ret < 0) {
+			mutex_unlock(&fs_info->reclaim_bgs_lock);
+			goto error;
+		}
 		if (!ret) {
 			mutex_unlock(&fs_info->reclaim_bgs_lock);
 			goto loop;
@@ -4502,7 +4579,7 @@ again:
 				mutex_unlock(&fs_info->reclaim_bgs_lock);
 				goto error;
 			} else if (ret == 1) {
-				chunk_reserved = 1;
+				chunk_reserved = true;
 			}
 		}
 
@@ -4763,7 +4840,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
 {
 	u64 meta_target, data_target;
 	u64 allowed;
-	int mixed = 0;
+	bool mixed = false;
 	int ret;
 	u64 num_devices;
 	unsigned seq;
@@ -4780,7 +4857,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
 
 	allowed = btrfs_super_incompat_flags(fs_info->super_copy);
 	if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
-		mixed = 1;
+		mixed = true;
 
 	/*
 	 * In case of mixed groups both data and meta should be picked,
@@ -6053,7 +6130,7 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
 
 	lockdep_assert_held(&info->chunk_mutex);
 
-	if (!alloc_profile_is_valid(type, 0)) {
+	if (unlikely(!alloc_profile_is_valid(type, 0))) {
 		DEBUG_WARN("invalid alloc profile for type %llu", type);
 		return ERR_PTR(-EINVAL);
 	}
@@ -6064,7 +6141,7 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
 		return ERR_PTR(-ENOSPC);
 	}
 
-	if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+	if (unlikely(!(type & BTRFS_BLOCK_GROUP_TYPE_MASK))) {
 		btrfs_err(info, "invalid chunk type 0x%llx requested", type);
 		DEBUG_WARN();
 		return ERR_PTR(-EINVAL);
@@ -6234,7 +6311,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
 
 	alloc_profile = btrfs_metadata_alloc_profile(fs_info);
 	meta_space_info = btrfs_find_space_info(fs_info, alloc_profile);
-	if (!meta_space_info) {
+	if (unlikely(!meta_space_info)) {
 		DEBUG_WARN();
 		return -EINVAL;
 	}
@@ -6244,7 +6321,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
 
 	alloc_profile = btrfs_system_alloc_profile(fs_info);
 	sys_space_info = btrfs_find_space_info(fs_info, alloc_profile);
-	if (!sys_space_info) {
+	if (unlikely(!sys_space_info)) {
 		DEBUG_WARN();
 		return -EINVAL;
 	}
@@ -8137,8 +8214,8 @@ static int btrfs_device_init_dev_stats(struct btrfs_device *device,
 
 	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
 		if (item_size >= (1 + i) * sizeof(__le64))
-			btrfs_dev_stat_set(device, i,
-					   btrfs_dev_stats_value(eb, ptr, i));
+			atomic_set(device->dev_stat_values + i,
+				   btrfs_dev_stats_value(eb, ptr, i));
 		else
 			btrfs_dev_stat_set(device, i, 0);
 	}
@@ -8179,6 +8256,37 @@ out:
 	return ret;
 }
 
+/*
+ * Set fs_info->writeback_bio_size to the largest optimal request size of all
+ * present devices, at least fs_info->sectorsize. RAID profiles submit stripe
+ * (64k) sized bios anyway, for simple profiles this approximates IO chunking.
+ */
+int btrfs_init_writeback_bio_size(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	struct btrfs_device *device;
+	u32 writeback_bio_size = fs_info->sectorsize;
+
+	mutex_lock(&fs_devices->device_list_mutex);
+	list_for_each_entry(device, &fs_devices->devices, dev_list) {
+		struct request_queue *queue;
+		u64 io_opt;
+
+		if (!device->bdev || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+			continue;
+		queue = bdev_get_queue(device->bdev);
+		/* Shift in 64 bits, max_sectors in bytes can exceed 32 bits. */
+		io_opt = queue_io_opt(queue) ? :
+			 (u64)queue_max_sectors(queue) << SECTOR_SHIFT;
+		io_opt = min_t(u64, io_opt, U32_MAX);
+		writeback_bio_size = max_t(u32, writeback_bio_size, io_opt);
+	}
+	mutex_unlock(&fs_devices->device_list_mutex);
+
+	fs_info->writeback_bio_size = writeback_bio_size;
+	return 0;
+}
+
 static int update_dev_stat_item(struct btrfs_trans_handle *trans,
 				struct btrfs_device *device)
 {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 0082c166af91..63be45c3298c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -784,6 +784,7 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
 			struct btrfs_ioctl_get_dev_stats *stats);
 int btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
+int btrfs_init_writeback_bio_size(struct btrfs_fs_info *fs_info);
 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans);
 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev);
 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev);
@@ -932,6 +933,7 @@ bool btrfs_first_pending_extent(struct btrfs_device *device, u64 start, u64 len,
 				u64 *pending_start, u64 *pending_end);
 bool btrfs_find_hole_in_pending_extents(struct btrfs_device *device,
 					u64 *start, u64 *len, u64 min_hole_size);
+int btrfs_remove_dev_stat_item(struct btrfs_trans_handle *trans, u64 devid);
 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 16dd87aa06f2..97f06dd01693 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -131,8 +131,10 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
 			u64 bytenr = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) -
 						BTRFS_SUPER_INFO_SIZE;
 
+			filemap_invalidate_lock_shared(mapping);
 			page[i] = read_cache_page_gfp(mapping,
 					bytenr >> PAGE_SHIFT, GFP_NOFS);
+			filemap_invalidate_unlock_shared(mapping);
 			if (IS_ERR(page[i])) {
 				if (i == 1)
 					btrfs_release_disk_super(super[0]);
@@ -354,12 +356,33 @@ int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
 	return ret;
 }
 
+/* Set zone_info->max_active_zones, fail with -EINVAL on too few zones. */
+static int btrfs_get_max_active_zones(struct btrfs_device *device,
+				      struct btrfs_zoned_device_info *zone_info)
+{
+	int limit;
+
+	if (unlikely(zone_info->nr_zones < BTRFS_MIN_ACTIVE_ZONES)) {
+		btrfs_err(device->fs_info, "zoned: not enough zones to mount filesystem: %u < %d",
+			  zone_info->nr_zones, BTRFS_MIN_ACTIVE_ZONES);
+		return -EINVAL;
+	}
+
+	limit = min_not_zero(bdev_max_active_zones(device->bdev),
+			     bdev_max_open_zones(device->bdev));
+	/* Both device limits are 0, derive a default from the zone count. */
+	if (!limit)
+		limit = min(zone_info->nr_zones / 4, BTRFS_DEFAULT_MAX_ACTIVE_ZONES);
+
+	zone_info->max_active_zones = max(limit, BTRFS_MIN_ACTIVE_ZONES);
+	return 0;
+}
+
 int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
 {
 	struct btrfs_fs_info *fs_info = device->fs_info;
 	struct btrfs_zoned_device_info *zone_info = NULL;
 	struct block_device *bdev = device->bdev;
-	unsigned int max_active_zones;
 	unsigned int nactive;
 	sector_t nr_sectors;
 	sector_t sector = 0;
@@ -424,19 +447,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
 	if (!IS_ALIGNED(nr_sectors, zone_sectors))
 		zone_info->nr_zones++;
 
-	max_active_zones = min_not_zero(bdev_max_active_zones(bdev),
-					bdev_max_open_zones(bdev));
-	if (!max_active_zones && zone_info->nr_zones > BTRFS_DEFAULT_MAX_ACTIVE_ZONES)
-		max_active_zones = BTRFS_DEFAULT_MAX_ACTIVE_ZONES;
-	if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
-		btrfs_err(fs_info,
-"zoned: %s: max active zones %u is too small, need at least %u active zones",
-				 rcu_dereference(device->name), max_active_zones,
-				 BTRFS_MIN_ACTIVE_ZONES);
-		ret = -EINVAL;
+	ret = btrfs_get_max_active_zones(device, zone_info);
+	if (ret)
 		goto out;
-	}
-	zone_info->max_active_zones = max_active_zones;
 
 	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
 	if (!zone_info->seq_zones) {
@@ -517,26 +530,29 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
 		goto out;
 	}
 
-	if (max_active_zones) {
-		if (unlikely(nactive > max_active_zones)) {
-			if (bdev_max_active_zones(bdev) == 0) {
-				max_active_zones = 0;
-				zone_info->max_active_zones = 0;
-				goto validate;
-			}
+	if (unlikely(nactive > zone_info->max_active_zones)) {
+		if (bdev_max_active_zones(bdev) > 0) {
 			btrfs_err(device->fs_info,
-			"zoned: %u active zones on %s exceeds max_active_zones %u",
-					 nactive, rcu_dereference(device->name),
-					 max_active_zones);
+					"zoned: %u active zones on %s exceeds max_active_zones %u",
+					nactive, rcu_dereference(device->name),
+					zone_info->max_active_zones);
 			ret = -EIO;
 			goto out;
 		}
+
+		/*
+		 * This is for backward compatibility with old filesystems that
+		 * have a lot of active zones because the device doesn't report
+		 * a maximum number of active zones and we previously didn't
+		 * care about the limit.
+		 */
+		zone_info->max_active_zones = 0;
+	} else {
 		atomic_set(&zone_info->active_zones_left,
-			   max_active_zones - nactive);
+				zone_info->max_active_zones - nactive);
 		set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);
 	}
 
-validate:
 	/* Validate superblock log */
 	nr_zones = BTRFS_NR_SB_LOG_ZONES;
 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
@@ -1311,7 +1327,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
 {
 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 	struct btrfs_device *device;
-	int dev_replace_is_ongoing = 0;
+	bool dev_replace_is_ongoing = false;
 	unsigned int nofs_flag;
 	struct blk_zone zone;
 	int ret;
@@ -2763,7 +2779,6 @@ void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
 	struct btrfs_block_group *bg;
 	struct list_head *bg_list;
 	u64 alloc_flags;
-	bool first = true;
 	bool did_chunk_alloc = false;
 	int index;
 	int ret;
@@ -2780,17 +2795,12 @@ void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
 	alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
 	index = btrfs_bg_flags_to_raid_index(alloc_flags);
 
-	/* Scan the data space_info to find empty block groups. Take the second one. */
 again:
 	bg_list = &space_info->block_groups[index];
 	list_for_each_entry(bg, bg_list, list) {
-		if (bg->alloc_offset != 0)
-			continue;
 
-		if (first) {
-			first = false;
+		if (bg->alloc_offset != 0)
 			continue;
-		}
 
 		if (space_info == data_sinfo) {
 			/* Migrate the block group to the data relocation space_info. */
@@ -2803,8 +2813,6 @@ again:
 
 			down_write(&space_info->groups_sem);
 			list_del_init(&bg->list);
-			/* We can assume this as we choose the second empty one. */
-			ASSERT(!list_empty(&space_info->block_groups[index]));
 			up_write(&space_info->groups_sem);
 
 			spin_lock(&space_info->lock);
@@ -2849,7 +2857,6 @@ again:
 		 * We allocated a new block group in the data relocation space_info. We
 		 * can take that one.
 		 */
-		first = false;
 		did_chunk_alloc = true;
 		goto again;
 	}
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index ec1df8b94517..4c5c47c5edb7 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -31,6 +31,19 @@ struct btrfs_space_info;
 struct btrfs_raid_bio;
 struct raid56_bio_trace_info;
 struct find_free_extent_ctl;
+struct btrfs_trans_handle;
+struct btrfs_transaction;
+struct btrfs_log_ctx;
+
+#define show_inode_type(mode)					\
+	__print_symbolic((mode) & S_IFMT,			\
+		 { S_IFDIR,  "DIR" },				\
+		 { S_IFREG,  "REG" },				\
+		 { S_IFLNK,  "LNK" },				\
+		 { S_IFIFO,  "FIFO" },				\
+		 { S_IFCHR,  "CHR" },				\
+		 { S_IFBLK,  "BLK" },				\
+		 { S_IFSOCK, "SOCK" })
 
 #define show_ref_type(type)						\
 	__print_symbolic(type,						\
@@ -101,8 +114,21 @@ struct find_free_extent_ctl;
 	EM( ALLOC_CHUNK_FORCE,		"ALLOC_CHUNK_FORCE")		\
 	EM( RUN_DELAYED_IPUTS,		"RUN_DELAYED_IPUTS")		\
 	EM( COMMIT_TRANS,		"COMMIT_TRANS")			\
+	EM( RECLAIM_ZONES,		"RECLAIM_ZONES")		\
 	EMe(RESET_ZONES,		"RESET_ZONES")
 
+#define TRANSACTION_STATES							\
+	EM( TRANS_STATE_RUNNING,		"TRANS_STATE_RUNNING")		\
+	EM( TRANS_STATE_COMMIT_PREP,		"TRANS_STATE_COMMIT_PREP")	\
+	EM( TRANS_STATE_COMMIT_START,		"TRANS_STATE_COMMIT_START")	\
+	EM( TRANS_STATE_COMMIT_DOING,		"TRANS_STATE_COMMIT_DOING")	\
+	EM( TRANS_STATE_UNBLOCKED,		"TRANS_STATE_UNBLOCKED")	\
+	EM( TRANS_STATE_SUPER_COMMITTED,	"TRANS_STATE_SUPER_COMMITTED")	\
+	EMe(TRANS_STATE_COMPLETED,		"TRANS_STATE_COMPLETED")
+
+#define LOG_MODES							\
+	EM( LOG_INODE_ALL,		"LOG_INODE_ALL")		\
+	EMe(LOG_INODE_EXISTS,		"LOG_INODE_EXISTS")
 /*
  * First define the enums in the above macros to be exported to userspace via
  * TRACE_DEFINE_ENUM().
@@ -118,6 +144,8 @@ FI_TYPES
 QGROUP_RSV_TYPES
 IO_TREE_OWNER
 FLUSH_STATES
+TRANSACTION_STATES
+LOG_MODES
 
 /*
  * Now redefine the EM and EMe macros to map the enums to the strings that will
@@ -180,25 +208,66 @@ FLUSH_STATES
 #define TP_printk_btrfs(fmt, args...) \
 	TP_printk("%pU: " fmt, __entry->fsid, args)
 
+TRACE_EVENT(btrfs_transaction_start,
+
+	TP_PROTO(const struct btrfs_transaction *trans),
+
+	TP_ARGS(trans),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,  generation		)
+	),
+
+	TP_fast_assign_btrfs(trans->fs_info,
+		__entry->generation	= trans->transid;
+	),
+
+	TP_printk_btrfs("gen=%llu", __entry->generation)
+);
+
 TRACE_EVENT(btrfs_transaction_commit,
 
-	TP_PROTO(const struct btrfs_fs_info *fs_info),
+	TP_PROTO(const struct btrfs_trans_handle *trans),
 
-	TP_ARGS(fs_info),
+	TP_ARGS(trans),
 
 	TP_STRUCT__entry_btrfs(
 		__field(	u64,  generation		)
-		__field(	u64,  root_objectid		)
+		__field(	bool, in_fsync			)
+		__field(	int,  state			)
 	),
 
-	TP_fast_assign_btrfs(fs_info,
-		__entry->generation	= fs_info->generation;
-		__entry->root_objectid	= BTRFS_ROOT_TREE_OBJECTID;
+	TP_fast_assign_btrfs(trans->fs_info,
+		__entry->generation	= trans->transid;
+		__entry->in_fsync	= trans->in_fsync;
+		__entry->state		= trans->transaction->state;
 	),
 
-	TP_printk_btrfs("root=%llu(%s) gen=%llu",
-		  show_root_type(__entry->root_objectid),
-		  __entry->generation)
+	TP_printk_btrfs("gen=%llu in_fsync=%d state=%d(%s)", __entry->generation,
+			__entry->in_fsync, __entry->state,
+			__print_symbolic(__entry->state, TRANSACTION_STATES))
+);
+
+TRACE_EVENT(btrfs_transaction_abort,
+
+	TP_PROTO(const struct btrfs_trans_handle *trans),
+
+	TP_ARGS(trans),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,  generation		)
+		__field(	bool, in_fsync			)
+		__field(	int,  error			)
+	),
+
+	TP_fast_assign_btrfs(trans->fs_info,
+		__entry->generation	= trans->transid;
+		__entry->in_fsync	= trans->in_fsync;
+		__entry->error	        = trans->aborted;
+	),
+
+	TP_printk_btrfs("gen=%llu in_fsync=%d error=%d", __entry->generation,
+			__entry->in_fsync, __entry->error)
 );
 
 DECLARE_EVENT_CLASS(btrfs__inode,
@@ -670,7 +739,7 @@ TRACE_EVENT(btrfs_finish_ordered_extent,
 	TP_printk_btrfs("root=%llu(%s) ino=%llu start=%llu len=%llu uptodate=%d",
 		  show_root_type(__entry->root_objectid),
 		  __entry->ino, __entry->start,
-		  __entry->len, !!__entry->uptodate)
+		  __entry->len, __entry->uptodate)
 );
 
 DECLARE_EVENT_CLASS(btrfs__writepage,
@@ -755,17 +824,18 @@ TRACE_EVENT(btrfs_writepage_end_io_hook,
 		  __entry->end, __entry->uptodate)
 );
 
-TRACE_EVENT(btrfs_sync_file,
+TRACE_EVENT(btrfs_sync_file_enter,
 
 	TP_PROTO(const struct file *file, int datasync),
 
 	TP_ARGS(file, datasync),
 
 	TP_STRUCT__entry_btrfs(
-		__field(	u64,	ino		)
-		__field(	u64,	parent		)
-		__field(	int,    datasync	)
-		__field(	u64,    root_objectid	)
+		__field(	u64,		ino		)
+		__field(        umode_t,        mode            )
+		__field(	u64,		parent		)
+		__field(	int,    	datasync	)
+		__field(	u64,    	root_objectid	)
 	),
 
 	TP_fast_assign(
@@ -778,13 +848,790 @@ TRACE_EVENT(btrfs_sync_file,
 		__entry->parent		= btrfs_ino(BTRFS_I(parent_inode));
 		__entry->datasync	= datasync;
 		__entry->root_objectid	= btrfs_root_id(BTRFS_I(inode)->root);
+		__entry->mode           = inode->i_mode;
 	),
 
-	TP_printk_btrfs("root=%llu(%s) ino=%llu parent=%llu datasync=%d",
-		  show_root_type(__entry->root_objectid),
-		  __entry->ino,
-		  __entry->parent,
-		  __entry->datasync)
+	TP_printk_btrfs("root=%llu(%s) ino=%llu type=%s parent=%llu datasync=%d",
+			show_root_type(__entry->root_objectid), __entry->ino,
+			show_inode_type(__entry->mode), __entry->parent,
+			__entry->datasync)
+);
+
+TRACE_EVENT(btrfs_sync_file_exit,
+
+	TP_PROTO(const struct file *file, int ret),
+
+	TP_ARGS(file, ret),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,	ino		)
+		__field(	int,    ret		)
+		__field(	u64,    root_objectid	)
+	),
+
+	TP_fast_assign(
+		struct btrfs_inode *inode = BTRFS_I(file_inode(file));
+
+		TP_fast_assign_fsid(inode->root->fs_info);
+		__entry->root_objectid	= btrfs_root_id(inode->root);
+		__entry->ino		= btrfs_ino(inode);
+		__entry->ret		= ret;
+	),
+
+	TP_printk_btrfs("root=%llu(%s) ino=%llu ret=%d",
+			show_root_type(__entry->root_objectid),
+			__entry->ino, __entry->ret)
+);
+
+TRACE_EVENT(btrfs_log_inode_parent_enter,
+	/* Records the inode's and its root's log tracking state for a transaction. */
+	TP_PROTO(const struct btrfs_trans_handle *trans, struct btrfs_inode *inode),
+
+	TP_ARGS(trans, inode),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,     	root_objectid		)
+		__field(	u64,	 	ino			)
+		__field(	umode_t,	mode			)
+		__field(	u64,		transid			)
+		__field(	u64,		generation		)
+		__field(	u64,		logged_trans		)
+		__field(	u64,		last_unlink_trans	)
+		__field(	int,		last_sub_trans		)
+		__field(	int,		inode_last_log_commit	)
+		__field(	int,		root_last_log_commit	)
+	),
+
+	TP_fast_assign(
+		struct btrfs_root *root = inode->root;
+
+		TP_fast_assign_fsid(root->fs_info);
+		__entry->root_objectid		= btrfs_root_id(root);
+		__entry->ino			= btrfs_ino(inode);
+		__entry->mode			= inode->vfs_inode.i_mode;
+		__entry->transid		= trans->transid;
+		__entry->generation		= inode->generation;
+		spin_lock(&inode->lock);	/* held across the four reads below */
+		__entry->logged_trans		= inode->logged_trans;
+		__entry->last_unlink_trans	= inode->last_unlink_trans;
+		__entry->last_sub_trans		= inode->last_sub_trans;
+		__entry->inode_last_log_commit	= inode->last_log_commit;
+		spin_unlock(&inode->lock);
+		__entry->root_last_log_commit	= btrfs_get_root_last_log_commit(root);
+	),
+
+	TP_printk_btrfs("root=%llu(%s) ino=%llu type=%s transid=%llu gen=%llu"
+			" logged_trans=%llu last_unlink_trans=%llu last_sub_trans=%d"
+			" inode_last_log_commit=%d root_last_log_commit=%d",
+			show_root_type(__entry->root_objectid), __entry->ino,
+			show_inode_type(__entry->mode), __entry->transid,
+			__entry->generation, __entry->logged_trans,
+			__entry->last_unlink_trans, __entry->last_sub_trans,
+			__entry->inode_last_log_commit, __entry->root_last_log_commit)
+);
+
+TRACE_EVENT(btrfs_log_inode_parent_exit,
+
+	TP_PROTO(const struct btrfs_trans_handle *trans,
+		 const struct btrfs_inode *inode,
+		 int ret),
+
+	TP_ARGS(trans, inode, ret),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,     	root_objectid	)
+		__field(	u64,	 	ino		)
+		__field(	u64,		transid		)
+		__field(	int,		ret		)
+	),
+
+	TP_fast_assign(
+		TP_fast_assign_fsid(inode->root->fs_info);
+		__entry->root_objectid		= btrfs_root_id(inode->root);
+		__entry->ino			= btrfs_ino(inode);
+		__entry->transid		= trans->transid;
+		__entry->ret			= ret;
+	),
+
+	TP_printk_btrfs("root=%llu(%s) ino=%llu transid=%llu ret=%d",
+			show_root_type(__entry->root_objectid), __entry->ino,
+			__entry->transid, __entry->ret)
+);
+
+/* Captures inode log tracking state, runtime sync flags and log ctx flags. */
+TRACE_EVENT(btrfs_log_inode_enter,
+	TP_PROTO(const struct btrfs_trans_handle *trans, struct btrfs_inode *inode,
+		 const struct btrfs_log_ctx *ctx, int log_mode),
+
+	TP_ARGS(trans, inode, ctx, log_mode),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,     		root_objectid			)
+		__field(	u64,	 		ino				)
+		__field(	umode_t,		mode				)
+		__field(	u64,			transid				)
+		__field(	u64,			generation			)
+		__field(	u64,			logged_trans			)
+		__field(	u64,			last_unlink_trans		)
+		__field(	u64,			last_reflink_trans		)
+		__field(	int,			last_sub_trans			)
+		__field(	int,			last_log_commit			)
+		__field(	bool,			logging_new_name		)
+		__field(	bool,			logging_new_delayed_dentries	)
+		__field(	bool,			is_conflict_inode		)
+		__field(	bool,			full_sync			)
+		__field(	bool,			copy_everything			)
+		__field(	bool,			no_xattrs			)
+		__field(	int,   			log_mode			)
+	),
+
+	TP_fast_assign(
+		TP_fast_assign_fsid(inode->root->fs_info);
+		__entry->root_objectid			= btrfs_root_id(inode->root);
+		__entry->ino				= btrfs_ino(inode);
+		__entry->mode				= inode->vfs_inode.i_mode;
+		__entry->transid			= trans->transid;
+		__entry->generation			= inode->generation;
+		spin_lock(&inode->lock);
+		__entry->logged_trans			= inode->logged_trans;
+		__entry->last_unlink_trans		= inode->last_unlink_trans;
+		__entry->last_reflink_trans		= inode->last_reflink_trans;
+		__entry->last_sub_trans			= inode->last_sub_trans;
+		__entry->last_log_commit		= inode->last_log_commit;
+		spin_unlock(&inode->lock);
+		__entry->logging_new_name		= ctx->logging_new_name;
+		__entry->logging_new_delayed_dentries	= ctx->logging_new_delayed_dentries;
+		__entry->is_conflict_inode		= ctx->logging_conflict_inodes;
+		__entry->full_sync			=
+			test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
+		__entry->copy_everything		=
+			test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
+		__entry->no_xattrs			=
+			test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
+		__entry->log_mode			= log_mode;
+	),
+	/* The arguments must follow the order of the format specifiers. */
+	TP_printk_btrfs("root=%llu(%s) ino=%llu type=%s transid=%llu gen=%llu"
+			" logged_trans=%llu last_unlink_trans=%llu"
+			" last_reflink_trans=%llu last_sub_trans=%d last_log_commit=%d"
+			" logging_new_name=%d logging_new_delayed_dentries=%d"
+			" is_conflict_inode=%d full_sync=%d copy_everything=%d"
+			" no_xattrs=%d log_mode=%d(%s)",
+			show_root_type(__entry->root_objectid), __entry->ino,
+			show_inode_type(__entry->mode), __entry->transid,
+			__entry->generation, __entry->logged_trans,
+			__entry->last_unlink_trans, __entry->last_reflink_trans,
+			__entry->last_sub_trans, __entry->last_log_commit,
+			__entry->logging_new_name, __entry->logging_new_delayed_dentries,
+			__entry->is_conflict_inode, __entry->full_sync,
+			__entry->copy_everything, __entry->no_xattrs, __entry->log_mode,
+			__print_symbolic(__entry->log_mode, LOG_MODES))
+);
+
+TRACE_EVENT(btrfs_log_inode_exit,
+
+	TP_PROTO(const struct btrfs_trans_handle *trans, struct btrfs_inode *inode,
+		 int ret),
+
+	TP_ARGS(trans, inode, ret),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,    	root_objectid		)
+		__field(	u64,		ino			)
+		__field(	u64,		transid			)
+		__field(	u64,		logged_trans		)
+		__field(	u64,		last_reflink_trans	)
+		__field(	int,		last_sub_trans		)
+		__field(	int,		last_log_commit		)
+		__field(	int,		ret			)
+	),
+
+	TP_fast_assign(
+		TP_fast_assign_fsid(inode->root->fs_info);
+		__entry->root_objectid			= btrfs_root_id(inode->root);
+		__entry->ino				= btrfs_ino(inode);
+		__entry->transid			= trans->transid;
+		spin_lock(&inode->lock);
+		__entry->logged_trans			= inode->logged_trans;
+		__entry->last_reflink_trans		= inode->last_reflink_trans;
+		__entry->last_sub_trans			= inode->last_sub_trans;
+		__entry->last_log_commit		= inode->last_log_commit;
+		spin_unlock(&inode->lock);
+		__entry->ret				= ret;
+	),
+
+	TP_printk_btrfs("root=%llu(%s) ino=%llu transid=%llu logged_trans=%llu"
+			" last_reflink_trans=%llu last_sub_trans=%d"
+			" last_log_commit=%d ret=%d",
+			show_root_type(__entry->root_objectid), __entry->ino,
+			__entry->transid, __entry->logged_trans,
+			__entry->last_reflink_trans, __entry->last_sub_trans,
+			__entry->last_log_commit, __entry->ret)
+);
+
+TRACE_EVENT(btrfs_log_all_parents_enter,
+
+	TP_PROTO(const struct btrfs_trans_handle *trans,
+		 const struct btrfs_inode *inode),
+
+	TP_ARGS(trans, inode),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,     	root_objectid		)
+		__field(	u64,	 	ino			)
+		__field(	u64,		transid			)
+	),
+
+	TP_fast_assign(
+		TP_fast_assign_fsid(inode->root->fs_info);
+		__entry->root_objectid		= btrfs_root_id(inode->root);
+		__entry->ino			= btrfs_ino(inode);
+		__entry->transid		= trans->transid;
+	),
+
+	TP_printk_btrfs("root=%llu(%s) ino=%llu transid=%llu",
+			show_root_type(__entry->root_objectid), __entry->ino,
+			__entry->transid)
+);
+
+TRACE_EVENT(btrfs_log_all_parents_exit,
+
+	TP_PROTO(const struct btrfs_trans_handle *trans,
+		 const struct btrfs_inode *inode,
+		 int ret),
+
+	TP_ARGS(trans, inode, ret),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,     	root_objectid		)
+		__field(	u64,	 	ino			)
+		__field(	u64,		transid			)
+		__field(	int,		ret			)
+	),
+
+	TP_fast_assign(
+		TP_fast_assign_fsid(inode->root->fs_info);
+		__entry->root_objectid		= btrfs_root_id(inode->root);
+		__entry->ino			= btrfs_ino(inode);
+		__entry->transid		= trans->transid;
+		__entry->ret			= ret;
+	),
+
+	TP_printk_btrfs("root=%llu(%s) ino=%llu transid=%llu ret=%d",
+			show_root_type(__entry->root_objectid), __entry->ino,
+			__entry->transid, __entry->ret)
+);
+
+TRACE_EVENT(btrfs_log_all_new_ancestors_enter,
+
+	TP_PROTO(const struct btrfs_trans_handle *trans,
+		 const struct btrfs_inode *inode),
+
+	TP_ARGS(trans, inode),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,     	root_objectid		)
+		__field(	u64,	 	ino			)
+		__field(	u64,		transid			)
+		__field(	unsigned int,	nlink			)
+	),
+
+	TP_fast_assign(
+		TP_fast_assign_fsid(inode->root->fs_info);
+		__entry->root_objectid		= btrfs_root_id(inode->root);
+		__entry->ino			= btrfs_ino(inode);
+		__entry->transid		= trans->transid;
+		__entry->nlink			= inode->vfs_inode.i_nlink;
+	),
+
+	TP_printk_btrfs("root=%llu(%s) ino=%llu transid=%llu nlink=%u",
+			show_root_type(__entry->root_objectid), __entry->ino,
+			__entry->transid, __entry->nlink)
+);
+
+/* Exit event: records root, ino, transid and the caller-supplied return value. */
+TRACE_EVENT(btrfs_log_all_new_ancestors_exit,
+	TP_PROTO(const struct btrfs_trans_handle *trans,
+		 const struct btrfs_inode *inode,
+		 int ret),
+
+	TP_ARGS(trans, inode, ret),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,     	root_objectid		)
+		__field(	u64,	 	ino			)
+		__field(	u64,		transid			)
+		__field(	int,		ret			)
+	),
+
+	TP_fast_assign(
+		TP_fast_assign_fsid(inode->root->fs_info);
+		__entry->root_objectid		= btrfs_root_id(inode->root);
+		__entry->ino			= btrfs_ino(inode);
+		__entry->transid		= trans->transid;
+		__entry->ret			= ret;
+	),
+
+	TP_printk_btrfs("root=%llu(%s) ino=%llu transid=%llu ret=%d",
+			show_root_type(__entry->root_objectid), __entry->ino,
+			__entry->transid, __entry->ret)
+);
+
+TRACE_EVENT(btrfs_log_new_dir_dentries_enter,
+
+	TP_PROTO(const struct btrfs_trans_handle *trans,
+		 const struct btrfs_inode *inode),
+
+	TP_ARGS(trans, inode),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,     	root_objectid		)
+		__field(	u64,	 	ino			)
+		__field(	u64,		transid			)
+	),
+
+	TP_fast_assign(
+		TP_fast_assign_fsid(inode->root->fs_info);
+		__entry->root_objectid		= btrfs_root_id(inode->root);
+		__entry->ino			= btrfs_ino(inode);
+		__entry->transid		= trans->transid;
+	),
+
+	TP_printk_btrfs("root=%llu(%s) ino=%llu transid=%llu",
+			show_root_type(__entry->root_objectid), __entry->ino,
+			__entry->transid)
+);
+
+TRACE_EVENT(btrfs_log_new_dir_dentries_exit,
+
+	TP_PROTO(const struct btrfs_trans_handle *trans,
+		 const struct btrfs_inode *inode,
+		 int ret),
+
+	TP_ARGS(trans, inode, ret),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,     	root_objectid		)
+		__field(	u64,	 	ino			)
+		__field(	u64,		transid			)
+		__field(	int,	 	ret			)
+	),
+
+	TP_fast_assign(
+		TP_fast_assign_fsid(inode->root->fs_info);
+		__entry->root_objectid		= btrfs_root_id(inode->root);
+		__entry->ino			= btrfs_ino(inode);
+		__entry->transid		= trans->transid;
+		__entry->ret			= ret;
+	),
+
+	TP_printk_btrfs("root=%llu(%s) ino=%llu transid=%llu ret=%d",
+			show_root_type(__entry->root_objectid), __entry->ino,
+			__entry->transid, __entry->ret)
+);
+
+TRACE_EVENT(btrfs_add_conflicting_inode_enter,
+
+	TP_PROTO(const struct btrfs_trans_handle *trans,
+		 const struct btrfs_log_ctx *ctx,
+		 u64 ino, u64 parent),
+
+	TP_ARGS(trans, ctx, ino, parent),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,     	root_objectid		)
+		__field(	u64,		transid			)
+		__field(	u64,	 	ctx_ino			)
+		__field(	u64,	 	conflict_ino		)
+		__field(	u64,	 	conflict_ino_parent	)
+	),
+
+	TP_fast_assign(
+		TP_fast_assign_fsid(trans->fs_info);
+		__entry->root_objectid		= btrfs_root_id(ctx->inode->root);
+		__entry->transid		= trans->transid;
+		__entry->ctx_ino		= btrfs_ino(ctx->inode);
+		__entry->conflict_ino		= ino;
+		__entry->conflict_ino_parent	= parent;
+	),
+
+	TP_printk_btrfs("root=%llu(%s) transid=%llu ctx_ino=%llu conflict_ino=%llu"
+			" conflict_ino_parent=%llu",
+			show_root_type(__entry->root_objectid), __entry->transid,
+			__entry->ctx_ino, __entry->conflict_ino,
+			__entry->conflict_ino_parent)
+);
+
+TRACE_EVENT(btrfs_add_conflicting_inode_exit,
+
+	TP_PROTO(const struct btrfs_trans_handle *trans,
+		 const struct btrfs_log_ctx *ctx,
+		 u64 ino, u64 parent, int ret),
+
+	TP_ARGS(trans, ctx, ino, parent, ret),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,     	root_objectid		)
+		__field(	u64,		transid			)
+		__field(	u64,	 	ctx_ino			)
+		__field(	u64,	 	conflict_ino		)
+		__field(	u64,	 	conflict_ino_parent	)
+		__field(	int,	 	ret			)
+	),
+
+	TP_fast_assign(
+		TP_fast_assign_fsid(trans->fs_info);
+		__entry->root_objectid		= btrfs_root_id(ctx->inode->root);
+		__entry->transid		= trans->transid;
+		__entry->ctx_ino		= btrfs_ino(ctx->inode);
+		__entry->conflict_ino		= ino;
+		__entry->conflict_ino_parent	= parent;
+		__entry->ret			= ret;
+	),
+
+	TP_printk_btrfs("root=%llu(%s) transid=%llu ctx_ino=%llu conflict_ino=%llu"
+			" conflict_ino_parent=%llu ret=%d",
+			show_root_type(__entry->root_objectid), __entry->transid,
+			__entry->ctx_ino, __entry->conflict_ino,
+			__entry->conflict_ino_parent, __entry->ret)
+);
+
+TRACE_EVENT(btrfs_log_conflicting_inodes_enter,
+
+	TP_PROTO(const struct btrfs_trans_handle *trans,
+		 const struct btrfs_log_ctx *ctx),
+
+	TP_ARGS(trans, ctx),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,     	root_objectid		)
+		__field(	u64,		transid			)
+		__field(	u64,	 	ctx_ino			)
+	),
+
+	TP_fast_assign(
+		TP_fast_assign_fsid(trans->fs_info);
+		__entry->root_objectid		= btrfs_root_id(ctx->inode->root);
+		__entry->transid		= trans->transid;
+		__entry->ctx_ino		= btrfs_ino(ctx->inode);
+	),
+
+	TP_printk_btrfs("root=%llu(%s) transid=%llu ctx_ino=%llu",
+			show_root_type(__entry->root_objectid), __entry->transid,
+			__entry->ctx_ino)
+);
+
+TRACE_EVENT(btrfs_log_conflicting_inodes_exit,
+
+	TP_PROTO(const struct btrfs_trans_handle *trans,
+		 const struct btrfs_log_ctx *ctx,
+		 int ret),
+
+	TP_ARGS(trans, ctx, ret),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,     	root_objectid		)
+		__field(	u64,		transid			)
+		__field(	u64,	 	ctx_ino			)
+		__field(	int,		ret			)
+	),
+
+	TP_fast_assign(
+		TP_fast_assign_fsid(trans->fs_info);
+		__entry->root_objectid		= btrfs_root_id(ctx->inode->root);
+		__entry->transid		= trans->transid;
+		__entry->ctx_ino		= btrfs_ino(ctx->inode);
+		__entry->ret			= ret;
+	),
+
+	TP_printk_btrfs("root=%llu(%s) transid=%llu ctx_ino=%llu ret=%d",
+			show_root_type(__entry->root_objectid), __entry->transid,
+			__entry->ctx_ino, __entry->ret)
+);
+
+TRACE_EVENT(btrfs_log_new_delayed_dentries_enter,
+
+	TP_PROTO(const struct btrfs_trans_handle *trans,
+		 const struct btrfs_inode *inode),
+
+	TP_ARGS(trans, inode),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,     	root_objectid		)
+		__field(	u64,		transid			)
+		__field(	u64,	 	ino			)
+	),
+
+	TP_fast_assign(
+		TP_fast_assign_fsid(trans->fs_info);
+		__entry->root_objectid		= btrfs_root_id(inode->root);
+		__entry->transid		= trans->transid;
+		__entry->ino			= btrfs_ino(inode);
+	),
+
+	TP_printk_btrfs("root=%llu(%s) transid=%llu ino=%llu",
+			show_root_type(__entry->root_objectid), __entry->transid,
+			__entry->ino)
+);
+
+TRACE_EVENT(btrfs_log_new_delayed_dentries_exit,
+
+	TP_PROTO(const struct btrfs_trans_handle *trans,
+		 const struct btrfs_inode *inode,
+		 int ret),
+
+	TP_ARGS(trans, inode, ret),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,     	root_objectid		)
+		__field(	u64,		transid			)
+		__field(	u64,	 	ino			)
+		__field(	int,	 	ret			)
+	),
+
+	TP_fast_assign(
+		TP_fast_assign_fsid(trans->fs_info);
+		__entry->root_objectid		= btrfs_root_id(inode->root);
+		__entry->transid		= trans->transid;
+		__entry->ino			= btrfs_ino(inode);
+		__entry->ret			= ret;
+	),
+
+	TP_printk_btrfs("root=%llu(%s) transid=%llu ino=%llu ret=%d",
+			show_root_type(__entry->root_objectid), __entry->transid,
+			__entry->ino, __entry->ret)
+);
+
+TRACE_EVENT(btrfs_record_unlink_dir,
+
+	TP_PROTO(const struct btrfs_trans_handle *trans,
+		 const struct btrfs_inode *dir,
+		 const struct btrfs_inode *inode,
+		 bool for_rename),
+
+	TP_ARGS(trans, dir, inode, for_rename),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,     	root_objectid		)
+		__field(	u64,		transid			)
+		__field(	u64,	 	ino			)
+		__field(	u64,	 	dir			)
+		__field(	bool,	 	for_rename		)
+	),
+
+	TP_fast_assign(
+		TP_fast_assign_fsid(trans->fs_info);
+		__entry->root_objectid		= btrfs_root_id(inode->root);
+		__entry->transid		= trans->transid;
+		__entry->ino			= btrfs_ino(inode);
+		__entry->dir			= btrfs_ino(dir);
+		__entry->for_rename		= for_rename;
+	),
+
+	TP_printk_btrfs("root=%llu(%s) transid=%llu ino=%llu dir=%llu for_rename=%d",
+			show_root_type(__entry->root_objectid), __entry->transid,
+			__entry->ino, __entry->dir, __entry->for_rename)
+);
+
+TRACE_EVENT(btrfs_record_snapshot_destroy,
+
+	TP_PROTO(const struct btrfs_trans_handle *trans,
+		 const struct btrfs_inode *dir),
+
+	TP_ARGS(trans, dir),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,     	root_objectid		)
+		__field(	u64,		transid			)
+		__field(	u64,	 	dir			)
+	),
+
+	TP_fast_assign(
+		TP_fast_assign_fsid(trans->fs_info);
+		__entry->root_objectid		= btrfs_root_id(dir->root);
+		__entry->transid		= trans->transid;
+		__entry->dir			= btrfs_ino(dir);
+	),
+
+	TP_printk_btrfs("root=%llu(%s) transid=%llu dir=%llu",
+			show_root_type(__entry->root_objectid), __entry->transid,
+			__entry->dir)
+);
+
+TRACE_EVENT(btrfs_record_new_subvolume,
+
+	TP_PROTO(const struct btrfs_trans_handle *trans,
+		 const struct btrfs_inode *dir),
+
+	TP_ARGS(trans, dir),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,     	root_objectid		)
+		__field(	u64,		transid			)
+		__field(	u64,	 	dir			)
+	),
+
+	TP_fast_assign(
+		TP_fast_assign_fsid(trans->fs_info);
+		__entry->root_objectid		= btrfs_root_id(dir->root);
+		__entry->transid		= trans->transid;
+		__entry->dir			= btrfs_ino(dir);
+	),
+
+	TP_printk_btrfs("root=%llu(%s) transid=%llu dir=%llu",
+			show_root_type(__entry->root_objectid), __entry->transid,
+			__entry->dir)
+);
+
+TRACE_EVENT(btrfs_log_new_name_enter,
+
+	TP_PROTO(const struct btrfs_trans_handle *trans,
+		 const struct btrfs_inode *inode,
+		 const struct btrfs_inode *old_dir,
+		 u64 old_dir_index),
+
+	TP_ARGS(trans, inode, old_dir, old_dir_index),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,     	root_objectid		)
+		__field(	u64,		transid			)
+		__field(	u64,	 	ino			)
+		__field(	umode_t,	mode			)
+		__field(	u64,	 	old_dir_ino		)
+		__field(	u64,		old_dir_index		)
+	),
+
+	TP_fast_assign(
+		TP_fast_assign_fsid(trans->fs_info);
+		__entry->root_objectid		= btrfs_root_id(inode->root);
+		__entry->transid		= trans->transid;
+		__entry->ino			= btrfs_ino(inode);
+		__entry->mode			= inode->vfs_inode.i_mode;
+		__entry->old_dir_ino		= old_dir ? btrfs_ino(old_dir) : 0;
+		__entry->old_dir_index		= old_dir_index;
+	),
+
+	TP_printk_btrfs("root=%llu(%s) transid=%llu ino=%llu type=%s"
+			" old_dir=%llu old_dir_index=%llu",
+			show_root_type(__entry->root_objectid), __entry->transid,
+			__entry->ino, show_inode_type(__entry->mode),
+			__entry->old_dir_ino, __entry->old_dir_index)
+);
+
+TRACE_EVENT(btrfs_log_new_name_exit,
+
+	TP_PROTO(const struct btrfs_trans_handle *trans,
+		 const struct btrfs_inode *inode,
+		 const struct btrfs_inode *old_dir,
+		 int ret),
+
+	TP_ARGS(trans, inode, old_dir, ret),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,     	root_objectid		)
+		__field(	u64,		transid			)
+		__field(	u64,	 	ino			)
+		__field(	u64,	 	old_dir_ino		)
+		__field(	int,		ret			)
+	),
+
+	TP_fast_assign(
+		TP_fast_assign_fsid(trans->fs_info);
+		__entry->root_objectid		= btrfs_root_id(inode->root);
+		__entry->transid		= trans->transid;
+		__entry->ino			= btrfs_ino(inode);
+		__entry->old_dir_ino		= old_dir ? btrfs_ino(old_dir) : 0;
+		__entry->ret			= ret;
+	),
+
+	TP_printk_btrfs("root=%llu(%s) transid=%llu ino=%llu old_dir=%llu ret=%d",
+			show_root_type(__entry->root_objectid), __entry->transid,
+			__entry->ino, __entry->old_dir_ino, __entry->ret)
+);
+
+/* Ideally call this while under root->log_mutex (but not always possible). */
+TRACE_EVENT(btrfs_sync_log_enter,
+	/* Records the root's log transid, log commit slots and log writer count. */
+	TP_PROTO(const struct btrfs_trans_handle *trans,
+		 const struct btrfs_root *root,
+		 const struct btrfs_log_ctx *ctx),
+
+	TP_ARGS(trans, root, ctx),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,     	root_objectid		)
+		__field(	u64,		transid			)
+		__field(	int,	 	ctx_log_transid		)
+		__field(	int,		root_log_transid	)
+		__field(	int,	 	log_transid_committed	)
+		__field(	bool,	 	log_committing		)
+		__field(	bool,	 	log_committing_prev	)
+		__field(	int,		log_writers		)
+	),
+
+	TP_fast_assign(
+		TP_fast_assign_fsid(trans->fs_info);
+		__entry->root_objectid		= btrfs_root_id(root);
+		__entry->transid		= trans->transid;
+		__entry->ctx_log_transid	= ctx->log_transid;
+		__entry->root_log_transid	= btrfs_get_root_log_transid(root);
+		__entry->log_transid_committed	=
+			data_race(root->log_transid_committed);
+		__entry->log_committing		=	/* slot for ctx->log_transid */
+			atomic_read(&root->log_commit[ctx->log_transid % 2]);
+		__entry->log_committing_prev	=	/* +1 and -1 share a slot mod 2 */
+			atomic_read(&root->log_commit[(ctx->log_transid + 1) % 2]);
+		__entry->log_writers		= atomic_read(&root->log_writers);
+	),
+
+	TP_printk_btrfs("root=%llu(%s) transid=%llu ctx_log_transid=%d"
+			" root_log_transid=%d log_transid_committed=%d"
+			" log_committing=%d log_committing_prev=%d log_writers=%d",
+			show_root_type(__entry->root_objectid), __entry->transid,
+			__entry->ctx_log_transid, __entry->root_log_transid,
+			__entry->log_transid_committed, __entry->log_committing,
+			__entry->log_committing_prev, __entry->log_writers)
+);
+
+/*
+ * Ideally call this while under root->log_mutex and in the same critical
+ * section that calls the btrfs_sync_log_enter() trace event (though it's not
+ * always possible).
+ */
+TRACE_EVENT(btrfs_sync_log_exit,
+
+	TP_PROTO(const struct btrfs_trans_handle *trans,
+		 const struct btrfs_root *root,
+		 const struct btrfs_log_ctx *ctx,
+		 int ret),
+
+	TP_ARGS(trans, root, ctx, ret),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,     	root_objectid		)
+		__field(	u64,		transid			)
+		__field(	int,	 	ctx_log_transid		)
+		__field(	int,		root_log_transid	)
+		__field(	int,	 	log_transid_committed	)
+		__field(	int,		ret			)
+	),
+
+	TP_fast_assign(
+		TP_fast_assign_fsid(trans->fs_info);
+		__entry->root_objectid		= btrfs_root_id(root);
+		__entry->transid		= trans->transid;
+		__entry->ctx_log_transid	= ctx->log_transid;
+		__entry->root_log_transid	= btrfs_get_root_log_transid(root);
+		__entry->log_transid_committed	=
+			data_race(root->log_transid_committed);
+		__entry->ret			= ret;
+	),
+
+	TP_printk_btrfs("root=%llu(%s) transid=%llu ctx_log_transid=%d"
+			" root_log_transid=%d log_transid_committed=%d ret=%d",
+			show_root_type(__entry->root_objectid), __entry->transid,
+			__entry->ctx_log_transid, __entry->root_log_transid,
+			__entry->log_transid_committed, __entry->ret)
 );
 
 TRACE_EVENT(btrfs_sync_fs,
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index 9165154a274d..9b576603b3f1 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -1100,6 +1100,38 @@ enum btrfs_err_code {
 	BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
 };
 
+/* Flags for struct btrfs_ioctl_get_csums_entry::type. */
+#define BTRFS_GET_CSUMS_HAS_CSUMS			(1U << 0)
+#define BTRFS_GET_CSUMS_ZEROED				(1U << 1)
+#define BTRFS_GET_CSUMS_NODATASUM			(1U << 2)
+#define BTRFS_GET_CSUMS_COMPRESSED			(1U << 3)
+#define BTRFS_GET_CSUMS_ENCRYPTED			(1U << 4)
+#define BTRFS_GET_CSUMS_INLINE				(1U << 5)
+
+struct btrfs_ioctl_get_csums_entry {
+	/* File offset of this range. */
+	__u64 offset;
+	/* Length in bytes. */
+	__u64 length;
+	/* BTRFS_GET_CSUMS_* flags for this range. */
+	__u32 type;
+	/* Padding, must be 0. */
+	__u32 reserved;
+};
+
+struct btrfs_ioctl_get_csums_args {
+	/* In/out: file offset in bytes. */
+	__u64 offset;
+	/* In/out: range length in bytes. */
+	__u64 length;
+	/* In/out: buffer capacity / bytes written. */
+	__u64 buf_size;
+	/* In: flags, must be 0 for now. */
+	__u64 flags;
+	/* Out: entries of type btrfs_ioctl_get_csums_entry + csum data */
+	__u8 buf[];
+};
+
 /* Flags for IOC_SHUTDOWN, must match XFS_FSOP_GOING_FLAGS_* flags. */
 #define BTRFS_SHUTDOWN_FLAGS_DEFAULT			0x0
 #define BTRFS_SHUTDOWN_FLAGS_LOGFLUSH			0x1
@@ -1226,6 +1258,8 @@ enum btrfs_err_code {
 				     struct btrfs_ioctl_encoded_io_args)
 #define BTRFS_IOC_SUBVOL_SYNC_WAIT _IOW(BTRFS_IOCTL_MAGIC, 65, \
 					struct btrfs_ioctl_subvol_wait)
+#define BTRFS_IOC_GET_CSUMS _IOWR(BTRFS_IOCTL_MAGIC, 66, \
+				  struct btrfs_ioctl_get_csums_args)
 
 /* Shutdown ioctl should follow XFS's interfaces, thus not using btrfs magic. */
 #define BTRFS_IOC_SHUTDOWN	_IOR('X', 125, __u32)
author	Linus Torvalds <torvalds@linux-foundation.org>	2026-06-16 09:38:02 +0300
committer	Linus Torvalds <torvalds@linux-foundation.org>	2026-06-16 09:38:02 +0300
commit	31b706da2cfd8ee3352391181ccf9696bed3d25d (patch)
tree	c76b0a7317b3b58f5638b9c7cabd8f640ce5be56
parent	477c122f8c1d5d9f57c4f9c1f7a1631beaa38bcc (diff)
parent	ae2eb64bfd9762536f60b690840adcdf622cdcce (diff)
download	linux-31b706da2cfd8ee3352391181ccf9696bed3d25d.tar.xz