From 5a80d1c6a2704d880c1df30315a1d9b0dc1f2cd8 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Date: Tue, 29 Jun 2021 03:36:45 +0900
Subject: btrfs: zoned: remove max_zone_append_size logic

There used to be a patch in the original series for zoned support which
limited the extent size to max_zone_append_size, but this patch has been
dropped somewhere around v9.

We've decided to go the opposite direction, instead of limiting extents
in the first place we split them before submission to comply with the
device's limits.

Remove the related code, btrfs_fs_info::max_zone_append_size and
btrfs_zoned_device_info::max_zone_append_size.

This also removes the workaround for dm-crypt introduced in
1d68128c107a ("btrfs: zoned: fail mount if the device does not support
zone append") because the fix has been merged as f34ee1dce642 ("dm
crypt: Fix zoned block device support").

Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h     |  2 --
 fs/btrfs/extent_io.c |  1 -
 fs/btrfs/zoned.c     | 20 --------------------
 fs/btrfs/zoned.h     |  1 -
 4 files changed, 24 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e5e53e592d4f..4a69aa604ac5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1012,8 +1012,6 @@ struct btrfs_fs_info {
 		u64 zoned;
 	};
 
-	/* Max size to emit ZONE_APPEND write command */
-	u64 max_zone_append_size;
 	struct mutex zoned_meta_io_lock;
 	spinlock_t treelog_bg_lock;
 	u64 treelog_bg;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 9e81d25dea70..1f947e24091a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3266,7 +3266,6 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
 		return 0;
 	}
 
-	ASSERT(fs_info->max_zone_append_size > 0);
 	/* Ordered extent not yet created, so we're good */
 	ordered = btrfs_lookup_ordered_extent(inode, logical);
 	if (!ordered) {
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 907c2cc45c9c..4f8bfceda095 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -296,7 +296,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
 	struct btrfs_fs_info *fs_info = device->fs_info;
 	struct btrfs_zoned_device_info *zone_info = NULL;
 	struct block_device *bdev = device->bdev;
-	struct request_queue *queue = bdev_get_queue(bdev);
 	sector_t nr_sectors;
 	sector_t sector = 0;
 	struct blk_zone *zones = NULL;
@@ -348,19 +347,10 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
 
 	nr_sectors = bdev_nr_sectors(bdev);
 	zone_info->zone_size_shift = ilog2(zone_info->zone_size);
-	zone_info->max_zone_append_size =
-		(u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT;
 	zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
 	if (!IS_ALIGNED(nr_sectors, zone_sectors))
 		zone_info->nr_zones++;
 
-	if (bdev_is_zoned(bdev) && zone_info->max_zone_append_size == 0) {
-		btrfs_err(fs_info, "zoned: device %pg does not support zone append",
-			  bdev);
-		ret = -EINVAL;
-		goto out;
-	}
-
 	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
 	if (!zone_info->seq_zones) {
 		ret = -ENOMEM;
@@ -529,7 +519,6 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
 	u64 zoned_devices = 0;
 	u64 nr_devices = 0;
 	u64 zone_size = 0;
-	u64 max_zone_append_size = 0;
 	const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
 	int ret = 0;
 
@@ -565,11 +554,6 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
 				ret = -EINVAL;
 				goto out;
 			}
-			if (!max_zone_append_size ||
-			    (zone_info->max_zone_append_size &&
-			     zone_info->max_zone_append_size < max_zone_append_size))
-				max_zone_append_size =
-					zone_info->max_zone_append_size;
 		}
 		nr_devices++;
 	}
@@ -619,7 +603,6 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
 	}
 
 	fs_info->zone_size = zone_size;
-	fs_info->max_zone_append_size = max_zone_append_size;
 	fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
 
 	/*
@@ -1318,9 +1301,6 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
 	if (!btrfs_is_zoned(fs_info))
 		return false;
 
-	if (!fs_info->max_zone_append_size)
-		return false;
-
 	if (!is_data_inode(&inode->vfs_inode))
 		return false;
 
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index b0ae2608cb6b..4b299705bb12 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -22,7 +22,6 @@ struct btrfs_zoned_device_info {
 	 */
 	u64 zone_size;
 	u8  zone_size_shift;
-	u64 max_zone_append_size;
 	u32 nr_zones;
 	unsigned long *seq_zones;
 	unsigned long *empty_zones;
-- 
cgit v1.2.3


From b3b7e1d0b4c2d80e7be8248305f6f47e46329e7e Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Wed, 23 Jun 2021 15:48:53 +0200
Subject: btrfs: add special case to setget helpers for 64k pages

On 64K pages the size of the extent_buffer::pages array is 1 and
compilation with -Warray-bounds warns due to

  kaddr = page_address(eb->pages[idx + 1]);

when reading byte range crossing page boundary.

This does never actually overflow the array because on 64K because all
the data fit in one page and bounds are checked by check_setget_bounds.

To fix the reported overflows and warnings add a compile-time condition
that will allow compiler to eliminate the dead code that reads from the
idx + 1 page.

Link: https://lore.kernel.org/lkml/20210623083901.1d49d19d@canb.auug.org.au/
CC: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/struct-funcs.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index 8260f8bb3ff0..f429256f56db 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -73,7 +73,7 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token,		\
 	}								\
 	token->kaddr = page_address(token->eb->pages[idx]);		\
 	token->offset = idx << PAGE_SHIFT;				\
-	if (oip + size <= PAGE_SIZE)					\
+	if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE ) \
 		return get_unaligned_le##bits(token->kaddr + oip);	\
 									\
 	memcpy(lebytes, token->kaddr + oip, part);			\
@@ -94,7 +94,7 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb,		\
 	u8 lebytes[sizeof(u##bits)];					\
 									\
 	ASSERT(check_setget_bounds(eb, ptr, off, size));		\
-	if (oip + size <= PAGE_SIZE)					\
+	if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE)	\
 		return get_unaligned_le##bits(kaddr + oip);		\
 									\
 	memcpy(lebytes, kaddr + oip, part);				\
@@ -124,7 +124,7 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token,		\
 	}								\
 	token->kaddr = page_address(token->eb->pages[idx]);		\
 	token->offset = idx << PAGE_SHIFT;				\
-	if (oip + size <= PAGE_SIZE) {					\
+	if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
 		put_unaligned_le##bits(val, token->kaddr + oip);	\
 		return;							\
 	}								\
@@ -146,7 +146,7 @@ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr,	\
 	u8 lebytes[sizeof(u##bits)];					\
 									\
 	ASSERT(check_setget_bounds(eb, ptr, off, size));		\
-	if (oip + size <= PAGE_SIZE) {					\
+	if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
 		put_unaligned_le##bits(val, kaddr + oip);		\
 		return;							\
 	}								\
-- 
cgit v1.2.3


From 4a9531cf89d29de82ef157513e593e58f49ef8f4 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Mon, 5 Jul 2021 17:06:18 +0800
Subject: btrfs: check-integrity: drop unnecessary function prototypes

The function prototypes below aren't necessary as the functions are
first defined before called. Remove them.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/check-integrity.c | 49 ----------------------------------------------
 1 file changed, 49 deletions(-)

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 169508609324..9cd88dfc5f8a 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -243,47 +243,6 @@ struct btrfsic_state {
 	u32 datablock_size;
 };
 
-static void btrfsic_block_init(struct btrfsic_block *b);
-static struct btrfsic_block *btrfsic_block_alloc(void);
-static void btrfsic_block_free(struct btrfsic_block *b);
-static void btrfsic_block_link_init(struct btrfsic_block_link *n);
-static struct btrfsic_block_link *btrfsic_block_link_alloc(void);
-static void btrfsic_block_link_free(struct btrfsic_block_link *n);
-static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds);
-static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void);
-static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds);
-static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h);
-static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
-					struct btrfsic_block_hashtable *h);
-static void btrfsic_block_hashtable_remove(struct btrfsic_block *b);
-static struct btrfsic_block *btrfsic_block_hashtable_lookup(
-		struct block_device *bdev,
-		u64 dev_bytenr,
-		struct btrfsic_block_hashtable *h);
-static void btrfsic_block_link_hashtable_init(
-		struct btrfsic_block_link_hashtable *h);
-static void btrfsic_block_link_hashtable_add(
-		struct btrfsic_block_link *l,
-		struct btrfsic_block_link_hashtable *h);
-static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l);
-static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
-		struct block_device *bdev_ref_to,
-		u64 dev_bytenr_ref_to,
-		struct block_device *bdev_ref_from,
-		u64 dev_bytenr_ref_from,
-		struct btrfsic_block_link_hashtable *h);
-static void btrfsic_dev_state_hashtable_init(
-		struct btrfsic_dev_state_hashtable *h);
-static void btrfsic_dev_state_hashtable_add(
-		struct btrfsic_dev_state *ds,
-		struct btrfsic_dev_state_hashtable *h);
-static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds);
-static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev,
-		struct btrfsic_dev_state_hashtable *h);
-static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void);
-static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf);
-static int btrfsic_process_superblock(struct btrfsic_state *state,
-				      struct btrfs_fs_devices *fs_devices);
 static int btrfsic_process_metablock(struct btrfsic_state *state,
 				     struct btrfsic_block *block,
 				     struct btrfsic_block_data_ctx *block_ctx,
@@ -313,14 +272,6 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
 static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
 static int btrfsic_read_block(struct btrfsic_state *state,
 			      struct btrfsic_block_data_ctx *block_ctx);
-static void btrfsic_dump_database(struct btrfsic_state *state);
-static int btrfsic_test_for_metadata(struct btrfsic_state *state,
-				     char **datav, unsigned int num_pages);
-static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
-					  u64 dev_bytenr, char **mapped_datav,
-					  unsigned int num_pages,
-					  struct bio *bio, int *bio_is_patched,
-					  int submit_bio_bh_rw);
 static int btrfsic_process_written_superblock(
 		struct btrfsic_state *state,
 		struct btrfsic_block *const block,
-- 
cgit v1.2.3


From 2eadb9e75e8e65eaf3e17628e24798a3c5374f90 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Mon, 5 Jul 2021 12:29:19 +0300
Subject: btrfs: make btrfs_finish_chunk_alloc private to block-group.c

One of the final things that must be done to add a new chunk is
inserting its device extent items in the device tree. They describe
the portion of allocated device physical space during phase 1 of
chunk allocation. This is currently done in btrfs_finish_chunk_alloc
whose name isn't very informative. What's more, this function is only
used in block-group.c but is defined as public. There isn't anything
special about it that would warrant it being defined in volumes.c.

Just move btrfs_finish_chunk_alloc and alloc_chunk_dev_extent to
block-group.c, make the former static and rename both functions to
insert_dev_extents and insert_dev_extent respectively.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/block-group.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/volumes.c     | 92 -------------------------------------------------
 fs/btrfs/volumes.h     |  2 --
 3 files changed, 91 insertions(+), 96 deletions(-)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 9e7d9d0c763d..5bd76a45037e 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -2244,6 +2244,95 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
 	return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
 }
 
+static int insert_dev_extent(struct btrfs_trans_handle *trans,
+			    struct btrfs_device *device, u64 chunk_offset,
+			    u64 start, u64 num_bytes)
+{
+	struct btrfs_fs_info *fs_info = device->fs_info;
+	struct btrfs_root *root = fs_info->dev_root;
+	struct btrfs_path *path;
+	struct btrfs_dev_extent *extent;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	int ret;
+
+	WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
+	WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = device->devid;
+	key.type = BTRFS_DEV_EXTENT_KEY;
+	key.offset = start;
+	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
+	if (ret)
+		goto out;
+
+	leaf = path->nodes[0];
+	extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
+	btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID);
+	btrfs_set_dev_extent_chunk_objectid(leaf, extent,
+					    BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
+
+	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
+	btrfs_mark_buffer_dirty(leaf);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * This function belongs to phase 2.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
+ */
+static int insert_dev_extents(struct btrfs_trans_handle *trans,
+				   u64 chunk_offset, u64 chunk_size)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_device *device;
+	struct extent_map *em;
+	struct map_lookup *map;
+	u64 dev_offset;
+	u64 stripe_size;
+	int i;
+	int ret = 0;
+
+	em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
+	if (IS_ERR(em))
+		return PTR_ERR(em);
+
+	map = em->map_lookup;
+	stripe_size = em->orig_block_len;
+
+	/*
+	 * Take the device list mutex to prevent races with the final phase of
+	 * a device replace operation that replaces the device object associated
+	 * with the map's stripes, because the device object's id can change
+	 * at any time during that final phase of the device replace operation
+	 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
+	 * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
+	 * resulting in persisting a device extent item with such ID.
+	 */
+	mutex_lock(&fs_info->fs_devices->device_list_mutex);
+	for (i = 0; i < map->num_stripes; i++) {
+		device = map->stripes[i].dev;
+		dev_offset = map->stripes[i].physical;
+
+		ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
+				       stripe_size);
+		if (ret)
+			break;
+	}
+	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+	free_extent_map(em);
+	return ret;
+}
+
 /*
  * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
  * chunk allocation.
@@ -2278,8 +2367,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
 			if (ret)
 				btrfs_abort_transaction(trans, ret);
 		}
-		ret = btrfs_finish_chunk_alloc(trans, block_group->start,
-					block_group->length);
+		ret = insert_dev_extents(trans, block_group->start,
+					 block_group->length);
 		if (ret)
 			btrfs_abort_transaction(trans, ret);
 		add_block_group_free_space(trans, block_group);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 70f94b75f25a..eb734099ccba 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1759,48 +1759,6 @@ out:
 	return ret;
 }
 
-static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
-				  struct btrfs_device *device,
-				  u64 chunk_offset, u64 start, u64 num_bytes)
-{
-	int ret;
-	struct btrfs_path *path;
-	struct btrfs_fs_info *fs_info = device->fs_info;
-	struct btrfs_root *root = fs_info->dev_root;
-	struct btrfs_dev_extent *extent;
-	struct extent_buffer *leaf;
-	struct btrfs_key key;
-
-	WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
-	WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	key.objectid = device->devid;
-	key.offset = start;
-	key.type = BTRFS_DEV_EXTENT_KEY;
-	ret = btrfs_insert_empty_item(trans, root, path, &key,
-				      sizeof(*extent));
-	if (ret)
-		goto out;
-
-	leaf = path->nodes[0];
-	extent = btrfs_item_ptr(leaf, path->slots[0],
-				struct btrfs_dev_extent);
-	btrfs_set_dev_extent_chunk_tree(leaf, extent,
-					BTRFS_CHUNK_TREE_OBJECTID);
-	btrfs_set_dev_extent_chunk_objectid(leaf, extent,
-					    BTRFS_FIRST_CHUNK_TREE_OBJECTID);
-	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
-
-	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
-	btrfs_mark_buffer_dirty(leaf);
-out:
-	btrfs_free_path(path);
-	return ret;
-}
-
 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
 {
 	struct extent_map_tree *em_tree;
@@ -5463,56 +5421,6 @@ out:
 	return block_group;
 }
 
-/*
- * This function, btrfs_finish_chunk_alloc(), belongs to phase 2.
- *
- * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
- * phases.
- */
-int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
-			     u64 chunk_offset, u64 chunk_size)
-{
-	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_device *device;
-	struct extent_map *em;
-	struct map_lookup *map;
-	u64 dev_offset;
-	u64 stripe_size;
-	int i;
-	int ret = 0;
-
-	em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
-	if (IS_ERR(em))
-		return PTR_ERR(em);
-
-	map = em->map_lookup;
-	stripe_size = em->orig_block_len;
-
-	/*
-	 * Take the device list mutex to prevent races with the final phase of
-	 * a device replace operation that replaces the device object associated
-	 * with the map's stripes, because the device object's id can change
-	 * at any time during that final phase of the device replace operation
-	 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
-	 * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
-	 * resulting in persisting a device extent item with such ID.
-	 */
-	mutex_lock(&fs_info->fs_devices->device_list_mutex);
-	for (i = 0; i < map->num_stripes; i++) {
-		device = map->stripes[i].dev;
-		dev_offset = map->stripes[i].physical;
-
-		ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
-					     dev_offset, stripe_size);
-		if (ret)
-			break;
-	}
-	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-
-	free_extent_map(em);
-	return ret;
-}
-
 /*
  * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the
  * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 55a8ba244716..70c749eee3ad 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -508,8 +508,6 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
 			   u64 logical, u64 len);
 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
 				    u64 logical);
-int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
-			     u64 chunk_offset, u64 chunk_size);
 int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
 				     struct btrfs_block_group *bg);
 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
-- 
cgit v1.2.3


From 67d5e289a193c643a70ceda437c625e2bc876dbc Mon Sep 17 00:00:00 2001
From: Marcos Paulo de Souza <mpdesouza@suse.com>
Date: Tue, 6 Jul 2021 15:13:25 -0300
Subject: btrfs: remove max argument from generic_bin_search

Both callers use btrfs_header_nritems to feed the max argument.  Remove
the argument and let generic_bin_search call it itself.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c5c08c87e130..c212f1218fdd 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -726,21 +726,21 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 
 /*
  * search for key in the extent_buffer.  The items start at offset p,
- * and they are item_size apart.  There are 'max' items in p.
+ * and they are item_size apart.
  *
  * the slot in the array is returned via slot, and it points to
  * the place where you would insert key if it is not found in
  * the array.
  *
- * slot may point to max if the key is bigger than all of the keys
+ * Slot may point to total number of items if the key is bigger than
+ * all of the keys
  */
 static noinline int generic_bin_search(struct extent_buffer *eb,
 				       unsigned long p, int item_size,
-				       const struct btrfs_key *key,
-				       int max, int *slot)
+				       const struct btrfs_key *key, int *slot)
 {
 	int low = 0;
-	int high = max;
+	int high = btrfs_header_nritems(eb);
 	int ret;
 	const int key_size = sizeof(struct btrfs_disk_key);
 
@@ -799,15 +799,11 @@ int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
 	if (btrfs_header_level(eb) == 0)
 		return generic_bin_search(eb,
 					  offsetof(struct btrfs_leaf, items),
-					  sizeof(struct btrfs_item),
-					  key, btrfs_header_nritems(eb),
-					  slot);
+					  sizeof(struct btrfs_item), key, slot);
 	else
 		return generic_bin_search(eb,
 					  offsetof(struct btrfs_node, ptrs),
-					  sizeof(struct btrfs_key_ptr),
-					  key, btrfs_header_nritems(eb),
-					  slot);
+					  sizeof(struct btrfs_key_ptr), key, slot);
 }
 
 static void root_add_used(struct btrfs_root *root, u32 size)
-- 
cgit v1.2.3


From 23608d51a3b2a0e1e884eba7b1d1eadefe4aadcc Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Sun, 4 Jul 2021 20:04:57 +0800
Subject: btrfs: cleanup fs_devices pointer usage in btrfs_trim_fs

Drop variable 'devices' (used only once) and add new variable for
the fs_devices, so it is used at two locations within btrfs_trim_fs()
function and also helps to access fs_devices->devices.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 268ce58d4569..d5925bebd379 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5950,9 +5950,9 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
  */
 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
 {
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 	struct btrfs_block_group *cache = NULL;
 	struct btrfs_device *device;
-	struct list_head *devices;
 	u64 group_trimmed;
 	u64 range_end = U64_MAX;
 	u64 start;
@@ -6016,9 +6016,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
 		btrfs_warn(fs_info,
 			"failed to trim %llu block group(s), last error %d",
 			bg_failed, bg_ret);
-	mutex_lock(&fs_info->fs_devices->device_list_mutex);
-	devices = &fs_info->fs_devices->devices;
-	list_for_each_entry(device, devices, dev_list) {
+
+	mutex_lock(&fs_devices->device_list_mutex);
+	list_for_each_entry(device, &fs_devices->devices, dev_list) {
 		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
 			continue;
 
@@ -6031,7 +6031,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
 
 		trimmed += group_trimmed;
 	}
-	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+	mutex_unlock(&fs_devices->device_list_mutex);
 
 	if (dev_failed)
 		btrfs_warn(fs_info,
-- 
cgit v1.2.3


From b0ee5e1ec44afda53aaa37f8c41ad00d170506cb Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 14 Jun 2021 22:22:22 +0200
Subject: btrfs: drop from __GFP_HIGHMEM all allocations

The highmem flag is used for allocating pages for compression and for
raid56 pages. The high memory makes sense on 32bit systems but is not
without problems. On 64bit system's it's just another layer of wrappers.

The time the pages are allocated for compression or raid56 is relatively
short (about a transaction commit), so the pages are not blocked
indefinitely. As the number of pages depends on the amount of data being
written/read, there's a theoretical problem. A fast device on a 32bit
system could use most of the low memory pool, while with the highmem
allocation that would not happen. This was possibly the original idea
long time ago, but nowadays we optimize for 64bit systems.

This patch removes all usage of the __GFP_HIGHMEM flag for page
allocation, the kmap/kunmap are still in place and will be removed in
followup patches. Remaining is masking out the bit in
alloc_extent_state and __lookup_free_space_inode, that can safely stay.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/compression.c |  3 +--
 fs/btrfs/lzo.c         |  4 ++--
 fs/btrfs/raid56.c      | 10 +++++-----
 fs/btrfs/zlib.c        |  6 +++---
 fs/btrfs/zstd.c        |  6 +++---
 5 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 30d82cdf128c..49fdec423092 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -721,8 +721,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 		goto fail1;
 
 	for (pg_index = 0; pg_index < nr_pages; pg_index++) {
-		cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
-							      __GFP_HIGHMEM);
+		cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS);
 		if (!cb->compressed_pages[pg_index]) {
 			faili = pg_index - 1;
 			ret = BLK_STS_RESOURCE;
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index cd042c7567a4..2bebb60c5830 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -146,7 +146,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
 	 * store the size of all chunks of compressed data in
 	 * the first 4 bytes
 	 */
-	out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	out_page = alloc_page(GFP_NOFS);
 	if (out_page == NULL) {
 		ret = -ENOMEM;
 		goto out;
@@ -216,7 +216,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
 					goto out;
 				}
 
-				out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+				out_page = alloc_page(GFP_NOFS);
 				if (out_page == NULL) {
 					ret = -ENOMEM;
 					goto out;
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 244d499ebc72..a40a45a007d4 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1035,7 +1035,7 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
 	for (i = 0; i < rbio->nr_pages; i++) {
 		if (rbio->stripe_pages[i])
 			continue;
-		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+		page = alloc_page(GFP_NOFS);
 		if (!page)
 			return -ENOMEM;
 		rbio->stripe_pages[i] = page;
@@ -1054,7 +1054,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
 	for (; i < rbio->nr_pages; i++) {
 		if (rbio->stripe_pages[i])
 			continue;
-		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+		page = alloc_page(GFP_NOFS);
 		if (!page)
 			return -ENOMEM;
 		rbio->stripe_pages[i] = page;
@@ -2300,7 +2300,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
 			if (rbio->stripe_pages[index])
 				continue;
 
-			page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+			page = alloc_page(GFP_NOFS);
 			if (!page)
 				return -ENOMEM;
 			rbio->stripe_pages[index] = page;
@@ -2350,14 +2350,14 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
 	if (!need_check)
 		goto writeback;
 
-	p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	p_page = alloc_page(GFP_NOFS);
 	if (!p_page)
 		goto cleanup;
 	SetPageUptodate(p_page);
 
 	if (has_qstripe) {
 		/* RAID6, allocate and map temp space for the Q stripe */
-		q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+		q_page = alloc_page(GFP_NOFS);
 		if (!q_page) {
 			__free_page(p_page);
 			goto cleanup;
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index c3fa7d3fa770..2c792bc5a987 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -121,7 +121,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
 	workspace->strm.total_in = 0;
 	workspace->strm.total_out = 0;
 
-	out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	out_page = alloc_page(GFP_NOFS);
 	if (out_page == NULL) {
 		ret = -ENOMEM;
 		goto out;
@@ -202,7 +202,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
 				ret = -E2BIG;
 				goto out;
 			}
-			out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+			out_page = alloc_page(GFP_NOFS);
 			if (out_page == NULL) {
 				ret = -ENOMEM;
 				goto out;
@@ -240,7 +240,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
 				ret = -E2BIG;
 				goto out;
 			}
-			out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+			out_page = alloc_page(GFP_NOFS);
 			if (out_page == NULL) {
 				ret = -ENOMEM;
 				goto out;
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 3e26b466476a..9451d2bb984e 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -405,7 +405,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
 
 
 	/* Allocate and map in the output buffer */
-	out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	out_page = alloc_page(GFP_NOFS);
 	if (out_page == NULL) {
 		ret = -ENOMEM;
 		goto out;
@@ -452,7 +452,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
 				ret = -E2BIG;
 				goto out;
 			}
-			out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+			out_page = alloc_page(GFP_NOFS);
 			if (out_page == NULL) {
 				ret = -ENOMEM;
 				goto out;
@@ -512,7 +512,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
 			ret = -E2BIG;
 			goto out;
 		}
-		out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+		out_page = alloc_page(GFP_NOFS);
 		if (out_page == NULL) {
 			ret = -ENOMEM;
 			goto out;
-- 
cgit v1.2.3


From 8c945d32e60427cbc0859cf7045bbe6196bb03d8 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 14 Jun 2021 22:25:53 +0200
Subject: btrfs: compression: drop kmap/kunmap from lzo

As we don't use highmem pages anymore, drop the kmap/kunmap. The kmap is
simply page_address and kunmap is a no-op.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/lzo.c | 38 ++++++++++----------------------------
 1 file changed, 10 insertions(+), 28 deletions(-)

diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 2bebb60c5830..576a0e6142ad 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -140,7 +140,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
 	*total_in = 0;
 
 	in_page = find_get_page(mapping, start >> PAGE_SHIFT);
-	data_in = kmap(in_page);
+	data_in = page_address(in_page);
 
 	/*
 	 * store the size of all chunks of compressed data in
@@ -151,7 +151,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
 		ret = -ENOMEM;
 		goto out;
 	}
-	cpage_out = kmap(out_page);
+	cpage_out = page_address(out_page);
 	out_offset = LZO_LEN;
 	tot_out = LZO_LEN;
 	pages[0] = out_page;
@@ -209,7 +209,6 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
 				if (out_len == 0 && tot_in >= len)
 					break;
 
-				kunmap(out_page);
 				if (nr_pages == nr_dest_pages) {
 					out_page = NULL;
 					ret = -E2BIG;
@@ -221,7 +220,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
 					ret = -ENOMEM;
 					goto out;
 				}
-				cpage_out = kmap(out_page);
+				cpage_out = page_address(out_page);
 				pages[nr_pages++] = out_page;
 
 				pg_bytes_left = PAGE_SIZE;
@@ -243,12 +242,11 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
 			break;
 
 		bytes_left = len - tot_in;
-		kunmap(in_page);
 		put_page(in_page);
 
 		start += PAGE_SIZE;
 		in_page = find_get_page(mapping, start >> PAGE_SHIFT);
-		data_in = kmap(in_page);
+		data_in = page_address(in_page);
 		in_len = min(bytes_left, PAGE_SIZE);
 	}
 
@@ -258,22 +256,17 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
 	}
 
 	/* store the size of all chunks of compressed data */
-	sizes_ptr = kmap_local_page(pages[0]);
+	sizes_ptr = page_address(pages[0]);
 	write_compress_length(sizes_ptr, tot_out);
-	kunmap_local(sizes_ptr);
 
 	ret = 0;
 	*total_out = tot_out;
 	*total_in = tot_in;
 out:
 	*out_pages = nr_pages;
-	if (out_page)
-		kunmap(out_page);
 
-	if (in_page) {
-		kunmap(in_page);
+	if (in_page)
 		put_page(in_page);
-	}
 
 	return ret;
 }
@@ -299,12 +292,11 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 	unsigned long tot_out;
 	unsigned long tot_len;
 	char *buf;
-	bool may_late_unmap, need_unmap;
 	struct page **pages_in = cb->compressed_pages;
 	u64 disk_start = cb->start;
 	struct bio *orig_bio = cb->orig_bio;
 
-	data_in = kmap(pages_in[0]);
+	data_in = page_address(pages_in[0]);
 	tot_len = read_compress_length(data_in);
 	/*
 	 * Compressed data header check.
@@ -345,13 +337,11 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 
 		tot_in += in_len;
 		working_bytes = in_len;
-		may_late_unmap = need_unmap = false;
 
 		/* fast path: avoid using the working buffer */
 		if (in_page_bytes_left >= in_len) {
 			buf = data_in + in_offset;
 			bytes = in_len;
-			may_late_unmap = true;
 			goto cont;
 		}
 
@@ -381,12 +371,8 @@ cont:
 					goto done;
 				}
 
-				if (may_late_unmap)
-					need_unmap = true;
-				else
-					kunmap(pages_in[page_in_index]);
-
-				data_in = kmap(pages_in[++page_in_index]);
+				page_in_index++;
+				data_in = page_address(pages_in[page_in_index]);
 
 				in_page_bytes_left = PAGE_SIZE;
 				in_offset = 0;
@@ -396,8 +382,6 @@ cont:
 		out_len = max_segment_len;
 		ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
 					    &out_len);
-		if (need_unmap)
-			kunmap(pages_in[page_in_index - 1]);
 		if (ret != LZO_E_OK) {
 			pr_warn("BTRFS: decompress failed\n");
 			ret = -EIO;
@@ -413,7 +397,6 @@ cont:
 			break;
 	}
 done:
-	kunmap(pages_in[page_in_index]);
 	if (!ret)
 		zero_fill_bio(orig_bio);
 	return ret;
@@ -466,7 +449,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
 	destlen = min_t(unsigned long, destlen, PAGE_SIZE);
 	bytes = min_t(unsigned long, destlen, out_len - start_byte);
 
-	kaddr = kmap_local_page(dest_page);
+	kaddr = page_address(dest_page);
 	memcpy(kaddr, workspace->buf + start_byte, bytes);
 
 	/*
@@ -476,7 +459,6 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
 	 */
 	if (bytes < destlen)
 		memset(kaddr+bytes, 0, destlen-bytes);
-	kunmap_local(kaddr);
 out:
 	return ret;
 }
-- 
cgit v1.2.3


From 696ab562e6df9fbafd6052d8ce4aafcb2ed16069 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Tue, 15 Jun 2021 01:00:05 +0200
Subject: btrfs: compression: drop kmap/kunmap from zlib

As we don't use highmem pages anymore, drop the kmap/kunmap. The kmap is
simply page_address and kunmap is a no-op.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/zlib.c | 36 +++++++++++-------------------------
 1 file changed, 11 insertions(+), 25 deletions(-)

diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 2c792bc5a987..5e18d7ad75a4 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -126,7 +126,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
 		ret = -ENOMEM;
 		goto out;
 	}
-	cpage_out = kmap(out_page);
+	cpage_out = page_address(out_page);
 	pages[0] = out_page;
 	nr_pages = 1;
 
@@ -148,26 +148,22 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
 				int i;
 
 				for (i = 0; i < in_buf_pages; i++) {
-					if (in_page) {
-						kunmap(in_page);
+					if (in_page)
 						put_page(in_page);
-					}
 					in_page = find_get_page(mapping,
 								start >> PAGE_SHIFT);
-					data_in = kmap(in_page);
+					data_in = page_address(in_page);
 					memcpy(workspace->buf + i * PAGE_SIZE,
 					       data_in, PAGE_SIZE);
 					start += PAGE_SIZE;
 				}
 				workspace->strm.next_in = workspace->buf;
 			} else {
-				if (in_page) {
-					kunmap(in_page);
+				if (in_page)
 					put_page(in_page);
-				}
 				in_page = find_get_page(mapping,
 							start >> PAGE_SHIFT);
-				data_in = kmap(in_page);
+				data_in = page_address(in_page);
 				start += PAGE_SIZE;
 				workspace->strm.next_in = data_in;
 			}
@@ -196,7 +192,6 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
 		 * the stream end if required
 		 */
 		if (workspace->strm.avail_out == 0) {
-			kunmap(out_page);
 			if (nr_pages == nr_dest_pages) {
 				out_page = NULL;
 				ret = -E2BIG;
@@ -207,7 +202,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
 				ret = -ENOMEM;
 				goto out;
 			}
-			cpage_out = kmap(out_page);
+			cpage_out = page_address(out_page);
 			pages[nr_pages] = out_page;
 			nr_pages++;
 			workspace->strm.avail_out = PAGE_SIZE;
@@ -234,7 +229,6 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
 			goto out;
 		} else if (workspace->strm.avail_out == 0) {
 			/* get another page for the stream end */
-			kunmap(out_page);
 			if (nr_pages == nr_dest_pages) {
 				out_page = NULL;
 				ret = -E2BIG;
@@ -245,7 +239,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
 				ret = -ENOMEM;
 				goto out;
 			}
-			cpage_out = kmap(out_page);
+			cpage_out = page_address(out_page);
 			pages[nr_pages] = out_page;
 			nr_pages++;
 			workspace->strm.avail_out = PAGE_SIZE;
@@ -264,13 +258,8 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
 	*total_in = workspace->strm.total_in;
 out:
 	*out_pages = nr_pages;
-	if (out_page)
-		kunmap(out_page);
-
-	if (in_page) {
-		kunmap(in_page);
+	if (in_page)
 		put_page(in_page);
-	}
 	return ret;
 }
 
@@ -289,7 +278,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 	u64 disk_start = cb->start;
 	struct bio *orig_bio = cb->orig_bio;
 
-	data_in = kmap(pages_in[page_in_index]);
+	data_in = page_address(pages_in[page_in_index]);
 	workspace->strm.next_in = data_in;
 	workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE);
 	workspace->strm.total_in = 0;
@@ -311,7 +300,6 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 
 	if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
 		pr_warn("BTRFS: inflateInit failed\n");
-		kunmap(pages_in[page_in_index]);
 		return -EIO;
 	}
 	while (workspace->strm.total_in < srclen) {
@@ -339,13 +327,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 
 		if (workspace->strm.avail_in == 0) {
 			unsigned long tmp;
-			kunmap(pages_in[page_in_index]);
+
 			page_in_index++;
 			if (page_in_index >= total_pages_in) {
 				data_in = NULL;
 				break;
 			}
-			data_in = kmap(pages_in[page_in_index]);
+			data_in = page_address(pages_in[page_in_index]);
 			workspace->strm.next_in = data_in;
 			tmp = srclen - workspace->strm.total_in;
 			workspace->strm.avail_in = min(tmp,
@@ -358,8 +346,6 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 		ret = 0;
 done:
 	zlib_inflateEnd(&workspace->strm);
-	if (data_in)
-		kunmap(pages_in[page_in_index]);
 	if (!ret)
 		zero_fill_bio(orig_bio);
 	return ret;
-- 
cgit v1.2.3


From bbaf9715f3f5b5ff0de71da91fcc34ee9c198ed8 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Tue, 15 Jun 2021 01:00:05 +0200
Subject: btrfs: compression: drop kmap/kunmap from zstd

As we don't use highmem pages anymore, drop the kmap/kunmap. The kmap is
simply page_address and kunmap is a no-op.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/zstd.c | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 9451d2bb984e..200ba08bfae6 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -399,7 +399,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
 
 	/* map in the first page of input data */
 	in_page = find_get_page(mapping, start >> PAGE_SHIFT);
-	workspace->in_buf.src = kmap(in_page);
+	workspace->in_buf.src = page_address(in_page);
 	workspace->in_buf.pos = 0;
 	workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
 
@@ -411,7 +411,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
 		goto out;
 	}
 	pages[nr_pages++] = out_page;
-	workspace->out_buf.dst = kmap(out_page);
+	workspace->out_buf.dst = page_address(out_page);
 	workspace->out_buf.pos = 0;
 	workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
 
@@ -446,7 +446,6 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
 		if (workspace->out_buf.pos == workspace->out_buf.size) {
 			tot_out += PAGE_SIZE;
 			max_out -= PAGE_SIZE;
-			kunmap(out_page);
 			if (nr_pages == nr_dest_pages) {
 				out_page = NULL;
 				ret = -E2BIG;
@@ -458,7 +457,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
 				goto out;
 			}
 			pages[nr_pages++] = out_page;
-			workspace->out_buf.dst = kmap(out_page);
+			workspace->out_buf.dst = page_address(out_page);
 			workspace->out_buf.pos = 0;
 			workspace->out_buf.size = min_t(size_t, max_out,
 							PAGE_SIZE);
@@ -473,13 +472,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
 		/* Check if we need more input */
 		if (workspace->in_buf.pos == workspace->in_buf.size) {
 			tot_in += PAGE_SIZE;
-			kunmap(in_page);
 			put_page(in_page);
 
 			start += PAGE_SIZE;
 			len -= PAGE_SIZE;
 			in_page = find_get_page(mapping, start >> PAGE_SHIFT);
-			workspace->in_buf.src = kmap(in_page);
+			workspace->in_buf.src = page_address(in_page);
 			workspace->in_buf.pos = 0;
 			workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
 		}
@@ -506,7 +504,6 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
 
 		tot_out += PAGE_SIZE;
 		max_out -= PAGE_SIZE;
-		kunmap(out_page);
 		if (nr_pages == nr_dest_pages) {
 			out_page = NULL;
 			ret = -E2BIG;
@@ -518,7 +515,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
 			goto out;
 		}
 		pages[nr_pages++] = out_page;
-		workspace->out_buf.dst = kmap(out_page);
+		workspace->out_buf.dst = page_address(out_page);
 		workspace->out_buf.pos = 0;
 		workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
 	}
@@ -534,12 +531,8 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
 out:
 	*out_pages = nr_pages;
 	/* Cleanup */
-	if (in_page) {
-		kunmap(in_page);
+	if (in_page)
 		put_page(in_page);
-	}
-	if (out_page)
-		kunmap(out_page);
 	return ret;
 }
 
@@ -565,7 +558,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 		goto done;
 	}
 
-	workspace->in_buf.src = kmap(pages_in[page_in_index]);
+	workspace->in_buf.src = page_address(pages_in[page_in_index]);
 	workspace->in_buf.pos = 0;
 	workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
 
@@ -601,14 +594,14 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 			break;
 
 		if (workspace->in_buf.pos == workspace->in_buf.size) {
-			kunmap(pages_in[page_in_index++]);
+			page_in_index++;
 			if (page_in_index >= total_pages_in) {
 				workspace->in_buf.src = NULL;
 				ret = -EIO;
 				goto done;
 			}
 			srclen -= PAGE_SIZE;
-			workspace->in_buf.src = kmap(pages_in[page_in_index]);
+			workspace->in_buf.src = page_address(pages_in[page_in_index]);
 			workspace->in_buf.pos = 0;
 			workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
 		}
@@ -616,8 +609,6 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 	ret = 0;
 	zero_fill_bio(orig_bio);
 done:
-	if (workspace->in_buf.src)
-		kunmap(pages_in[page_in_index]);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 4c2bf276b56d8d27ddbafcdf056ef3fc60ae50b0 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Tue, 15 Jun 2021 17:15:38 +0200
Subject: btrfs: compression: drop kmap/kunmap from generic helpers

The pages in compressed_pages are not from highmem anymore so we can
drop the mapping for checksum calculation and inline extent.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/compression.c | 3 +--
 fs/btrfs/inode.c       | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 49fdec423092..aeda426b6121 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -172,10 +172,9 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
 		/* Hash through the page sector by sector */
 		for (pg_offset = 0; pg_offset < bytes_left;
 		     pg_offset += sectorsize) {
-			kaddr = kmap_atomic(page);
+			kaddr = page_address(page);
 			crypto_shash_digest(shash, kaddr + pg_offset,
 					    sectorsize, csum);
-			kunmap_atomic(kaddr);
 
 			if (memcmp(&csum, cb_sum, csum_size) != 0) {
 				btrfs_print_data_csum_error(inode, disk_start,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 06f9f167222b..c84d42a290c6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -286,9 +286,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
 			cur_size = min_t(unsigned long, compressed_size,
 				       PAGE_SIZE);
 
-			kaddr = kmap_atomic(cpage);
+			kaddr = page_address(cpage);
 			write_extent_buffer(leaf, kaddr, ptr, cur_size);
-			kunmap_atomic(kaddr);
 
 			i++;
 			ptr += cur_size;
-- 
cgit v1.2.3


From 5da384799278afe0d2557e4d4482240840c208b8 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Tue, 15 Jun 2021 17:15:38 +0200
Subject: btrfs: check-integrity: drop kmap/kunmap for block pages

The pages in block_ctx have never been allocated from highmem (in
btrfsic_read_block) so the mapping is pointless and can be removed.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/check-integrity.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 9cd88dfc5f8a..86816088927f 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1509,10 +1509,8 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
 		/* Pages must be unmapped in reverse order */
 		while (num_pages > 0) {
 			num_pages--;
-			if (block_ctx->datav[num_pages]) {
-				kunmap_local(block_ctx->datav[num_pages]);
+			if (block_ctx->datav[num_pages])
 				block_ctx->datav[num_pages] = NULL;
-			}
 			if (block_ctx->pagev[num_pages]) {
 				__free_page(block_ctx->pagev[num_pages]);
 				block_ctx->pagev[num_pages] = NULL;
@@ -1589,7 +1587,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
 		i = j;
 	}
 	for (i = 0; i < num_pages; i++)
-		block_ctx->datav[i] = kmap_local_page(block_ctx->pagev[i]);
+		block_ctx->datav[i] = page_address(block_ctx->pagev[i]);
 
 	return block_ctx->len;
 }
@@ -2654,7 +2652,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
 
 		bio_for_each_segment(bvec, bio, iter) {
 			BUG_ON(bvec.bv_len != PAGE_SIZE);
-			mapped_datav[i] = kmap_local_page(bvec.bv_page);
+			mapped_datav[i] = page_address(bvec.bv_page);
 			i++;
 
 			if (dev_state->state->print_mask &
@@ -2667,9 +2665,6 @@ static void __btrfsic_submit_bio(struct bio *bio)
 					      mapped_datav, segs,
 					      bio, &bio_is_patched,
 					      bio->bi_opf);
-		/* Unmap in reverse order */
-		for (--i; i >= 0; i--)
-			kunmap_local(mapped_datav[i]);
 		kfree(mapped_datav);
 	} else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
 		if (dev_state->state->print_mask &
-- 
cgit v1.2.3


From 069a2e37789a9adb236d8f7a5f65a1390b51f184 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 20 Jul 2021 16:03:03 +0100
Subject: btrfs: continue readahead of siblings even if target node is in
 memory

At reada_for_search(), when attempting to readahead a node or leaf's
siblings, we skip the readahead of the siblings if the node/leaf is
already in memory. That is probably fine for the READA_FORWARD and
READA_BACK readahead types, as they are used on contexts where we
end up reading some consecutive leaves, but usually not the whole btree.

However for a READA_FORWARD_ALWAYS mode, currently only used for full
send operations, it does not make sense to skip the readahead if the
target node or leaf is already loaded in memory, since we know the caller
is visiting every node and leaf of the btree in ascending order.

So change the behaviour to not skip the readahead when the target node is
already in memory and the readahead mode is READA_FORWARD_ALWAYS.

The following test script was used to measure the improvement on a box
using an average, consumer grade, spinning disk, with 32GiB of RAM and
using a non-debug kernel config (Debian's default config).

  $ cat test.sh
  #!/bin/bash

  DEV=/dev/sdj
  MNT=/mnt/sdj
  MKFS_OPTIONS="--nodesize 16384"     # default, just to be explicit
  MOUNT_OPTIONS="-o max_inline=2048"  # default, just to be explicit

  mkfs.btrfs -f $MKFS_OPTIONS $DEV > /dev/null
  mount $MOUNT_OPTIONS $DEV $MNT

  # Create files with inline data to make it easier and faster to create
  # large btrees.
  add_files()
  {
      local total=$1
      local start_offset=$2
      local number_jobs=$3
      local total_per_job=$(($total / $number_jobs))

      echo "Creating $total new files using $number_jobs jobs"
      for ((n = 0; n < $number_jobs; n++)); do
          (
              local start_num=$(($start_offset + $n * $total_per_job))
              for ((i = 1; i <= $total_per_job; i++)); do
                  local file_num=$((start_num + $i))
                  local file_path="$MNT/file_${file_num}"
                  xfs_io -f -c "pwrite -S 0xab 0 2000" $file_path > /dev/null
                  if [ $? -ne 0 ]; then
                      echo "Failed creating file $file_path"
                      break
                  fi
              done
          ) &
          worker_pids[$n]=$!
      done

      wait ${worker_pids[@]}

      sync
      echo
      echo "btree node/leaf count: $(btrfs inspect-internal dump-tree -t 5 $DEV | egrep '^(node|leaf) ' | wc -l)"
  }

  file_count=2000000
  add_files $file_count 0 4

  echo
  echo "Creating snapshot..."
  btrfs subvolume snapshot -r $MNT $MNT/snap1

  umount $MNT

  echo 3 > /proc/sys/vm/drop_caches
  blockdev --flushbufs $DEV &> /dev/null
  hdparm -F $DEV &> /dev/null

  mount $MOUNT_OPTIONS $DEV $MNT

  echo
  echo "Testing full send..."
  start=$(date +%s)
  btrfs send $MNT/snap1 > /dev/null
  end=$(date +%s)
  echo
  echo "Full send took $((end - start)) seconds"

  umount $MNT

The duration of the full send operations, in seconds, were the following:

Before this change:  85 seconds
After this change:   76 seconds (-11.2%)

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c212f1218fdd..63c026495193 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1233,7 +1233,6 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
 	u64 target;
 	u64 nread = 0;
 	u64 nread_max;
-	struct extent_buffer *eb;
 	u32 nr;
 	u32 blocksize;
 	u32 nscan = 0;
@@ -1262,10 +1261,14 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
 
 	search = btrfs_node_blockptr(node, slot);
 	blocksize = fs_info->nodesize;
-	eb = find_extent_buffer(fs_info, search);
-	if (eb) {
-		free_extent_buffer(eb);
-		return;
+	if (path->reada != READA_FORWARD_ALWAYS) {
+		struct extent_buffer *eb;
+
+		eb = find_extent_buffer(fs_info, search);
+		if (eb) {
+			free_extent_buffer(eb);
+			return;
+		}
 	}
 
 	target = search;
-- 
cgit v1.2.3


From 6534c0c99dddafc47bd4152949751ccd6a5681fc Mon Sep 17 00:00:00 2001
From: Marcos Paulo de Souza <mpdesouza@suse.com>
Date: Tue, 20 Jul 2021 15:02:47 -0300
Subject: btrfs: pass NULL as trans to btrfs_search_slot if we only want to
 search

Using a transaction in btrfs_search_slot is only useful when we are
searching to add or modify the tree. When the function is used for
searching, insert length and mod arguments are 0, there is no need to
use a transaction.

No functional changes, changing for consistency.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c     | 2 +-
 fs/btrfs/extent-tree.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 78b202d198b8..728c728d03f3 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1211,7 +1211,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 again:
 	head = NULL;
 
-	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
+	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		goto out;
 	BUG_ON(ret == 0);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d5925bebd379..fc3da7585fb7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -153,7 +153,7 @@ search_again:
 	else
 		key.type = BTRFS_EXTENT_ITEM_KEY;
 
-	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
+	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		goto out_free;
 
-- 
cgit v1.2.3


From 2b29726c473b3f7d1b8f22d138ed12b2776bb5d2 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 19 Jul 2021 13:43:04 +0800
Subject: btrfs: rescue: allow ibadroots to skip bad extent tree when reading
 block group items

When extent tree gets corrupted, normally it's not extent tree root, but
one toasted tree leaf/node.

In that case, rescue=ibadroots mount option won't help as it can only
handle the extent tree root corruption.

This patch will enhance the behavior by:

- Allow fill_dummy_bgs() to ignore -EEXIST error

  This means we may have some block group items read from disk, but
  then hit some error halfway.

- Fallback to fill_dummy_bgs() if any error gets hit in
  btrfs_read_block_groups()

  Of course, this still needs rescue=ibadroots mount option.

With that, rescue=ibadroots can handle extent tree corruption more
gracefully and allow a better recover chance.

Reported-by: Zhenyu Wu <wuzy001@gmail.com>
Link: https://www.spinics.net/lists/linux-btrfs/msg114424.html
Reviewed-by: Su Yue <l@damenly.su>
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/block-group.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 5bd76a45037e..d5421ee0d366 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -2105,11 +2105,22 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
 		bg->used = em->len;
 		bg->flags = map->type;
 		ret = btrfs_add_block_group_cache(fs_info, bg);
+		/*
+		 * We may have some valid block group cache added already, in
+		 * that case we skip to the next one.
+		 */
+		if (ret == -EEXIST) {
+			ret = 0;
+			btrfs_put_block_group(bg);
+			continue;
+		}
+
 		if (ret) {
 			btrfs_remove_free_space_cache(bg);
 			btrfs_put_block_group(bg);
 			break;
 		}
+
 		btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
 					0, 0, &space_info);
 		bg->space_info = space_info;
@@ -2212,6 +2223,14 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
 	ret = check_chunk_block_group_mappings(info);
 error:
 	btrfs_free_path(path);
+	/*
+	 * We've hit some error while reading the extent tree, and have
+	 * rescue=ibadroots mount option.
+	 * Try to fill the tree using dummy block groups so that the user can
+	 * continue to mount and grab their data.
+	 */
+	if (ret && btrfs_test_opt(info, IGNOREBADROOTS))
+		ret = fill_dummy_bgs(info);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 506650dcb3a716ad98681f7091ba2f8e748c04b8 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 20 Jul 2021 16:05:22 +0100
Subject: btrfs: improve the batch insertion of delayed items

When we insert the delayed items of an inode, which corresponds to the
directory index keys for a directory (key type BTRFS_DIR_INDEX_KEY), we
do the following:

1) Pick the first delayed item from the rbtree and insert it into the
   fs/subvolume btree, using btrfs_insert_empty_item() for that;

2) Without releasing the path returned by btrfs_insert_empty_item(),
   keep collecting as many consecutive delayed items from the rbtree
   as possible, as long as each one's BTRFS_DIR_INDEX_KEY key is the
   immediate successor of the previously picked item and as long as
   they fit in the available space of the leaf the path points to;

3) Then insert all the collected items into the leaf;

4) Release the reserve metadata space for each collected item and
   release each item (implies deleting from the rbtree);

5) Unlock the path.

While this is much better than inserting items one by one, it can be
improved in a few aspects:

1) Instead of adding items based on the remaining free space of the
   leaf, collect as many items that can fit in a leaf and bulk insert
   them. This results in less and larger batches, reducing the total
   amount of time to insert the delayed items. For example when adding
   100K files to a directory, we ended up creating 1658 batches with
   very variable sizes ranging from 1 item to 118 items, on a filesystem
   with a node/leaf size of 16K. After this change, we end up with 839
   batches, with the vast majority of them having exactly 120 items;

2) We do the search for more items to batch, by iterating the rbtree,
   while holding a write lock on the leaf;

3) While still holding the leaf locked, we are releasing the reserved
   metadata for each item and then deleting each item, keeping a write
   lock on the leaf for longer than necessary. Releasing the delayed items
   one by one can take a significant amount of time, because deleting
   them from the rbtree can often be a bit slow when the deletion results
   in rebalancing the rbtree.

So change this so that we try to create larger batches, with a total
item size up to the maximum a leaf can support, and by unlocking the leaf
immediately after inserting the items, releasing the reserved metadata
space of each item and releasing each item without holding the write lock
on the leaf.

The following script that runs fs_mark was used to test this change:

  $ cat test.sh
  #!/bin/bash

  DEV=/dev/nvme0n1
  MNT=/mnt/nvme0n1
  MOUNT_OPTIONS="-o ssd"
  MKFS_OPTIONS="-m single -d single"
  FILES=1000000
  THREADS=16
  FILE_SIZE=0

  echo "performance" | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

  umount $DEV &> /dev/null
  mkfs.btrfs -f $MKFS_OPTIONS $DEV
  mount $MOUNT_OPTIONS $DEV $MNT

  OPTS="-S 0 -L 5 -n $FILES -s $FILE_SIZE -t 16"
  for ((i = 1; i <= $THREADS; i++)); do
      OPTS="$OPTS -d $MNT/d$i"
  done

  fs_mark $OPTS

  umount $MNT

It was run on machine with 12 cores, 64G of ram, using a NVMe device and
using a non-debug kernel config (Debian's default config).

Results before this change:

FSUse%        Count         Size    Files/sec         App Overhead
     1     16000000            0      76182.1             72223046
     3     32000000            0      62746.9             80776528
     5     48000000            0      77029.0             93022381
     6     64000000            0      73691.6             95251075
     8     80000000            0      66288.0             85089634

Results after this change:

FSUse%        Count         Size    Files/sec         App Overhead
     1     16000000            0      79049.5 (+3.7%)     69700824
     3     32000000            0      65248.9 (+3.9%)     80583693
     5     48000000            0      77991.4 (+1.2%)     90040908
     6     64000000            0      75096.8 (+1.9%)     89862241
     8     80000000            0      66926.8 (+1.0%)     84429169

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/delayed-inode.c | 212 ++++++++++++++++++-----------------------------
 1 file changed, 79 insertions(+), 133 deletions(-)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 257c1e18abd4..20f3e748027e 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -672,176 +672,122 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
 }
 
 /*
- * This helper will insert some continuous items into the same leaf according
- * to the free space of the leaf.
+ * Insert a single delayed item or a batch of delayed items that have consecutive
+ * keys if they exist.
  */
-static int btrfs_batch_insert_items(struct btrfs_root *root,
-				    struct btrfs_path *path,
-				    struct btrfs_delayed_item *item)
+static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     struct btrfs_path *path,
+				     struct btrfs_delayed_item *first_item)
 {
-	struct btrfs_delayed_item *curr, *next;
-	int free_space;
-	int total_size = 0;
-	struct extent_buffer *leaf;
-	char *data_ptr;
-	struct btrfs_key *keys;
-	u32 *data_size;
-	struct list_head head;
-	int slot;
+	LIST_HEAD(batch);
+	struct btrfs_delayed_item *curr;
+	struct btrfs_delayed_item *next;
+	const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info);
+	int total_size;
 	int nitems;
-	int i;
-	int ret = 0;
-
-	BUG_ON(!path->nodes[0]);
-
-	leaf = path->nodes[0];
-	free_space = btrfs_leaf_free_space(leaf);
-	INIT_LIST_HEAD(&head);
+	unsigned int nofs_flag;
+	char *ins_data = NULL;
+	struct btrfs_key *ins_keys;
+	u32 *ins_sizes;
+	int ret;
 
-	next = item;
-	nitems = 0;
+	list_add_tail(&first_item->tree_list, &batch);
+	nitems = 1;
+	total_size = first_item->data_len + sizeof(struct btrfs_item);
+	curr = first_item;
 
-	/*
-	 * count the number of the continuous items that we can insert in batch
-	 */
-	while (total_size + next->data_len + sizeof(struct btrfs_item) <=
-	       free_space) {
-		total_size += next->data_len + sizeof(struct btrfs_item);
-		list_add_tail(&next->tree_list, &head);
-		nitems++;
+	while (true) {
+		int next_size;
 
-		curr = next;
 		next = __btrfs_next_delayed_item(curr);
-		if (!next)
+		if (!next || !btrfs_is_continuous_delayed_item(curr, next))
 			break;
 
-		if (!btrfs_is_continuous_delayed_item(curr, next))
+		next_size = next->data_len + sizeof(struct btrfs_item);
+		if (total_size + next_size > max_size)
 			break;
-	}
 
-	if (!nitems) {
-		ret = 0;
-		goto out;
+		list_add_tail(&next->tree_list, &batch);
+		nitems++;
+		total_size += next_size;
+		curr = next;
 	}
 
-	keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS);
-	if (!keys) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	if (nitems == 1) {
+		ins_keys = &first_item->key;
+		ins_sizes = &first_item->data_len;
+	} else {
+		int i = 0;
 
-	data_size = kmalloc_array(nitems, sizeof(u32), GFP_NOFS);
-	if (!data_size) {
-		ret = -ENOMEM;
-		goto error;
+		ins_data = kmalloc(nitems * sizeof(u32) +
+				   nitems * sizeof(struct btrfs_key), GFP_NOFS);
+		if (!ins_data) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		ins_sizes = (u32 *)ins_data;
+		ins_keys = (struct btrfs_key *)(ins_data + nitems * sizeof(u32));
+		list_for_each_entry(curr, &batch, tree_list) {
+			ins_keys[i] = curr->key;
+			ins_sizes[i] = curr->data_len;
+			i++;
+		}
 	}
 
-	/* get keys of all the delayed items */
-	i = 0;
-	list_for_each_entry(next, &head, tree_list) {
-		keys[i] = next->key;
-		data_size[i] = next->data_len;
-		i++;
-	}
+	nofs_flag = memalloc_nofs_save();
+	ret = btrfs_insert_empty_items(trans, root, path, ins_keys, ins_sizes,
+				       nitems);
+	memalloc_nofs_restore(nofs_flag);
+	if (ret)
+		goto out;
 
-	/* insert the keys of the items */
-	setup_items_for_insert(root, path, keys, data_size, nitems);
+	list_for_each_entry(curr, &batch, tree_list) {
+		char *data_ptr;
 
-	/* insert the dir index items */
-	slot = path->slots[0];
-	list_for_each_entry_safe(curr, next, &head, tree_list) {
-		data_ptr = btrfs_item_ptr(leaf, slot, char);
-		write_extent_buffer(leaf, &curr->data,
-				    (unsigned long)data_ptr,
-				    curr->data_len);
-		slot++;
+		data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
+		write_extent_buffer(path->nodes[0], &curr->data,
+				    (unsigned long)data_ptr, curr->data_len);
+		path->slots[0]++;
+	}
 
-		btrfs_delayed_item_release_metadata(root, curr);
+	/*
+	 * Now release our path before releasing the delayed items and their
+	 * metadata reservations, so that we don't block other tasks for more
+	 * time than needed.
+	 */
+	btrfs_release_path(path);
 
+	list_for_each_entry_safe(curr, next, &batch, tree_list) {
 		list_del(&curr->tree_list);
+		btrfs_delayed_item_release_metadata(root, curr);
 		btrfs_release_delayed_item(curr);
 	}
-
-error:
-	kfree(data_size);
-	kfree(keys);
 out:
+	kfree(ins_data);
 	return ret;
 }
 
-/*
- * This helper can just do simple insertion that needn't extend item for new
- * data, such as directory name index insertion, inode insertion.
- */
-static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
-				     struct btrfs_root *root,
-				     struct btrfs_path *path,
-				     struct btrfs_delayed_item *delayed_item)
-{
-	struct extent_buffer *leaf;
-	unsigned int nofs_flag;
-	char *ptr;
-	int ret;
-
-	nofs_flag = memalloc_nofs_save();
-	ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key,
-				      delayed_item->data_len);
-	memalloc_nofs_restore(nofs_flag);
-	if (ret < 0 && ret != -EEXIST)
-		return ret;
-
-	leaf = path->nodes[0];
-
-	ptr = btrfs_item_ptr(leaf, path->slots[0], char);
-
-	write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
-			    delayed_item->data_len);
-	btrfs_mark_buffer_dirty(leaf);
-
-	btrfs_delayed_item_release_metadata(root, delayed_item);
-	return 0;
-}
-
-/*
- * we insert an item first, then if there are some continuous items, we try
- * to insert those items into the same leaf.
- */
 static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans,
 				      struct btrfs_path *path,
 				      struct btrfs_root *root,
 				      struct btrfs_delayed_node *node)
 {
-	struct btrfs_delayed_item *curr, *prev;
 	int ret = 0;
 
-do_again:
-	mutex_lock(&node->mutex);
-	curr = __btrfs_first_delayed_insertion_item(node);
-	if (!curr)
-		goto insert_end;
-
-	ret = btrfs_insert_delayed_item(trans, root, path, curr);
-	if (ret < 0) {
-		btrfs_release_path(path);
-		goto insert_end;
-	}
+	while (ret == 0) {
+		struct btrfs_delayed_item *curr;
 
-	prev = curr;
-	curr = __btrfs_next_delayed_item(prev);
-	if (curr && btrfs_is_continuous_delayed_item(prev, curr)) {
-		/* insert the continuous items into the same leaf */
-		path->slots[0]++;
-		btrfs_batch_insert_items(root, path, curr);
+		mutex_lock(&node->mutex);
+		curr = __btrfs_first_delayed_insertion_item(node);
+		if (!curr) {
+			mutex_unlock(&node->mutex);
+			break;
+		}
+		ret = btrfs_insert_delayed_item(trans, root, path, curr);
+		mutex_unlock(&node->mutex);
 	}
-	btrfs_release_delayed_item(prev);
-	btrfs_mark_buffer_dirty(path->nodes[0]);
-
-	btrfs_release_path(path);
-	mutex_unlock(&node->mutex);
-	goto do_again;
 
-insert_end:
-	mutex_unlock(&node->mutex);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 5a656c3628b241443fd07cda60f3b0587bb8c328 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 20 Jul 2021 16:05:23 +0100
Subject: btrfs: stop doing GFP_KERNEL memory allocations in the ref verify
 tool

In commit 351cbf6e4410e7 ("btrfs: use nofs allocations for running delayed
items") we wrapped all btree updates when running delayed items with
memalloc_nofs_save() and memalloc_nofs_restore(), due to a lock inversion
detected by lockdep involving reclaim and the mutex of delayed nodes.

The problem is because the ref verify tool does some memory allocations
with GFP_KERNEL, which can trigger reclaim and reclaim can trigger inode
eviction, which requires locking the mutex of an inode's delayed node.
On the other hand the ref verify tool is called when allocating metadata
extents as part of operations that modify a btree, which is a problem when
running delayed nodes, where we do btree updates while holding the mutex
of a delayed node. This is what caused the lockdep warning.

Instead of wrapping every btree update when running delayed nodes, change
the ref verify tool to never do GFP_KERNEL allocations, because:

1) We get less repeated code, which at the moment does not even have a
   comment mentioning why we need to setup the NOFS context, which is a
   recommended good practice as mentioned at
   Documentation/core-api/gfp_mask-from-fs-io.rst

2) The ref verify tool is something meant only for debugging and not
   something that should be enabled on non-debug / non-development
   kernels;

3) We may have yet more places outside delayed-inode.c where we have
   similar problem: doing btree updates while holding some lock and
   then having the GFP_KERNEL memory allocations, from the ref verify
   tool, trigger reclaim and trying again to acquire the same lock
   through the reclaim path.
   Or we could get more such cases in the future, therefore this change
   prevents getting into similar cases when using the ref verify tool.

Curiously most of the memory allocations done by the ref verify tool
were already using GFP_NOFS, except a few ones for no apparent reason.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/delayed-inode.c | 12 ------------
 fs/btrfs/ref-verify.c    | 10 +++++-----
 2 files changed, 5 insertions(+), 17 deletions(-)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 20f3e748027e..61452f04181a 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -6,7 +6,6 @@
 
 #include <linux/slab.h>
 #include <linux/iversion.h>
-#include <linux/sched/mm.h>
 #include "misc.h"
 #include "delayed-inode.h"
 #include "disk-io.h"
@@ -686,7 +685,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
 	const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info);
 	int total_size;
 	int nitems;
-	unsigned int nofs_flag;
 	char *ins_data = NULL;
 	struct btrfs_key *ins_keys;
 	u32 *ins_sizes;
@@ -735,10 +733,8 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
 		}
 	}
 
-	nofs_flag = memalloc_nofs_save();
 	ret = btrfs_insert_empty_items(trans, root, path, ins_keys, ins_sizes,
 				       nitems);
-	memalloc_nofs_restore(nofs_flag);
 	if (ret)
 		goto out;
 
@@ -860,7 +856,6 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
 				      struct btrfs_delayed_node *node)
 {
 	struct btrfs_delayed_item *curr, *prev;
-	unsigned int nofs_flag;
 	int ret = 0;
 
 do_again:
@@ -869,9 +864,7 @@ do_again:
 	if (!curr)
 		goto delete_fail;
 
-	nofs_flag = memalloc_nofs_save();
 	ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1);
-	memalloc_nofs_restore(nofs_flag);
 	if (ret < 0)
 		goto delete_fail;
 	else if (ret > 0) {
@@ -940,7 +933,6 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
 	struct btrfs_key key;
 	struct btrfs_inode_item *inode_item;
 	struct extent_buffer *leaf;
-	unsigned int nofs_flag;
 	int mod;
 	int ret;
 
@@ -953,9 +945,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
 	else
 		mod = 1;
 
-	nofs_flag = memalloc_nofs_save();
 	ret = btrfs_lookup_inode(trans, root, path, &key, mod);
-	memalloc_nofs_restore(nofs_flag);
 	if (ret > 0)
 		ret = -ENOENT;
 	if (ret < 0)
@@ -1012,9 +1002,7 @@ search:
 	key.type = BTRFS_INODE_EXTREF_KEY;
 	key.offset = -1;
 
-	nofs_flag = memalloc_nofs_save();
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-	memalloc_nofs_restore(nofs_flag);
 	if (ret < 0)
 		goto err_out;
 	ASSERT(ret);
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 8e026de74c44..d2062d5f71dd 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -264,8 +264,8 @@ static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info,
 	struct block_entry *be = NULL, *exist;
 	struct root_entry *re = NULL;
 
-	re = kzalloc(sizeof(struct root_entry), GFP_KERNEL);
-	be = kzalloc(sizeof(struct block_entry), GFP_KERNEL);
+	re = kzalloc(sizeof(struct root_entry), GFP_NOFS);
+	be = kzalloc(sizeof(struct block_entry), GFP_NOFS);
 	if (!be || !re) {
 		kfree(re);
 		kfree(be);
@@ -313,7 +313,7 @@ static int add_tree_block(struct btrfs_fs_info *fs_info, u64 ref_root,
 	struct root_entry *re;
 	struct ref_entry *ref = NULL, *exist;
 
-	ref = kmalloc(sizeof(struct ref_entry), GFP_KERNEL);
+	ref = kmalloc(sizeof(struct ref_entry), GFP_NOFS);
 	if (!ref)
 		return -ENOMEM;
 
@@ -358,7 +358,7 @@ static int add_shared_data_ref(struct btrfs_fs_info *fs_info,
 	struct block_entry *be;
 	struct ref_entry *ref;
 
-	ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL);
+	ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS);
 	if (!ref)
 		return -ENOMEM;
 	be = add_block_entry(fs_info, bytenr, num_bytes, 0);
@@ -393,7 +393,7 @@ static int add_extent_data_ref(struct btrfs_fs_info *fs_info,
 	u64 offset = btrfs_extent_data_ref_offset(leaf, dref);
 	u32 num_refs = btrfs_extent_data_ref_count(leaf, dref);
 
-	ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL);
+	ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS);
 	if (!ref)
 		return -ENOMEM;
 	be = add_block_entry(fs_info, bytenr, num_bytes, ref_root);
-- 
cgit v1.2.3


From cceaa89f02f15f232391ae4be214137b0a0285c0 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 20 Jul 2021 16:03:40 +0100
Subject: btrfs: remove racy and unnecessary inode transaction update when
 using no-holes

When using the NO_HOLES feature and expanding the size of an inode, we
update the inode's last_trans, last_sub_trans and last_log_commit fields
at maybe_insert_hole() so that a fsync does know that the inode needs to
be logged (by making sure that btrfs_inode_in_log() returns false). This
happens for expanding truncate operations, buffered writes, direct IO
writes and when cloning extents to an offset greater than the inode's
i_size.

However the way we do it is racy, because in between setting the inode's
last_sub_trans and last_log_commit fields, the log transaction ID that was
assigned to last_sub_trans might be committed before we read the root's
last_log_commit and assign that value to last_log_commit. If that happens
it would make a future call to btrfs_inode_in_log() return true. This is
a race that should be extremely unlikely to be hit in practice, and it is
the same that was described by commit bc0939fcfab0d7 ("btrfs: fix race
between marking inode needs to be logged and log syncing").

The fix would simply be to set last_log_commit to the value we assigned
to last_sub_trans minus 1, like it was done in that commit. However
updating these two fields plus the last_trans field is pointless here
because all the callers of btrfs_cont_expand() (which is the only
caller of maybe_insert_hole()) always call btrfs_set_inode_last_trans()
or btrfs_update_inode() after calling btrfs_cont_expand(). Calling either
btrfs_set_inode_last_trans() or btrfs_update_inode() guarantees that the
next fsync will log the inode, as it makes btrfs_inode_in_log() return
false.

So just remove the code that explicitly sets the inode's last_trans,
last_sub_trans and last_log_commit fields.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c84d42a290c6..deec968f937f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5087,15 +5087,13 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
 	int ret;
 
 	/*
-	 * Still need to make sure the inode looks like it's been updated so
-	 * that any holes get logged if we fsync.
+	 * If NO_HOLES is enabled, we don't need to do anything.
+	 * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
+	 * or btrfs_update_inode() will be called, which guarantee that the next
+	 * fsync will know this inode was changed and needs to be logged.
 	 */
-	if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
-		inode->last_trans = fs_info->generation;
-		inode->last_sub_trans = root->log_transid;
-		inode->last_log_commit = root->last_log_commit;
+	if (btrfs_fs_incompat(fs_info, NO_HOLES))
 		return 0;
-	}
 
 	/*
 	 * 1 - for the one we're dropping
-- 
cgit v1.2.3


From e1a6d2648300ef4cfdcfd4838224fe5cefe3caaa Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 20 Jul 2021 16:03:41 +0100
Subject: btrfs: avoid unnecessary log mutex contention when syncing log

When syncing the log we acquire the root's log mutex just to update the
root's last_log_commit. This is unnecessary because:

1) At this point there can only be one task updating this value, which is
   the task committing the current log transaction. Any task that enters
   btrfs_sync_log() has to wait for the previous log transaction to commit
   and wait for the current log transaction to commit if someone else
   already started it (in this case it never reaches to the point of
   updating last_log_commit, as that is done by the committing task);

2) All readers of the root's last_log_commit don't acquire the root's
   log mutex. This is to avoid blocking the readers, potentially for too
   long and because getting a stale value of last_log_commit does not
   cause any functional problem, in the worst case getting a stale value
   results in logging an inode unnecessarily. Plus it's actually very
   rare to get a stale value that results in unnecessarily logging the
   inode.

So in order to avoid unnecessary contention on the root's log mutex,
which is used for several different purposes, like starting/joining a
log transaction and starting writeback of a log transaction, stop
acquiring the log mutex for updating the root's last_log_commit.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index e6430ac9bbe8..a7ce23b9d2ee 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3328,10 +3328,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		goto out_wake_log_root;
 	}
 
-	mutex_lock(&root->log_mutex);
-	if (root->last_log_commit < log_transid)
-		root->last_log_commit = log_transid;
-	mutex_unlock(&root->log_mutex);
+	/*
+	 * We know there can only be one task here, since we have not yet set
+	 * root->log_commit[index1] to 0 and any task attempting to sync the
+	 * log must wait for the previous log transaction to commit if it's
+	 * still in progress or wait for the current log transaction commit if
+	 * someone else already started it. We use <= and not < because the
+	 * first log transaction has an ID of 0.
+	 */
+	ASSERT(root->last_log_commit <= log_transid);
+	root->last_log_commit = log_transid;
 
 out_wake_log_root:
 	mutex_lock(&log_root_tree->log_mutex);
-- 
cgit v1.2.3


From e68107e51f8466e1fae40d64b873d0a11398a628 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 20 Jul 2021 16:03:42 +0100
Subject: btrfs: remove unnecessary list head initialization when syncing log

One of the last steps of syncing the log is to remove all log contexts
from the root's list of contexts, done at btrfs_remove_all_log_ctxs().
There we iterate over all the contexts in the list and delete each one
from the list, and after that we call INIT_LIST_HEAD() on the list. That
is unnecessary since at that point the list is empty.

So just remove the INIT_LIST_HEAD() call. It's not needed, increases code
size (bloat-o-meter reported a delta of -122 for btrfs_sync_log() after
this change) and increases two critical sections delimited by log mutexes.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index a7ce23b9d2ee..3e6c8f8a2909 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3039,8 +3039,6 @@ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
 		list_del_init(&ctx->list);
 		ctx->log_ret = error;
 	}
-
-	INIT_LIST_HEAD(&root->log_ctxs[index]);
 }
 
 /*
-- 
cgit v1.2.3


From 2ac691d8b3b1dd300a48b1763fa3a1434863070b Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 20 Jul 2021 16:03:43 +0100
Subject: btrfs: avoid unnecessary lock and leaf splits when updating inode in
 the log

During a fast fsync, if we have already fsynced the file before and in the
current transaction, we can make the inode item update more efficient and
avoid acquiring a write lock on the leaf's parent.

To update the inode item we are always using btrfs_insert_empty_item() to
get a path pointing to the inode item, which calls btrfs_search_slot()
with an "ins_len" argument of 'sizeof(struct btrfs_inode_item) +
sizeof(struct btrfs_item)', and that always results in the search taking
a write lock on the level 1 node that is the parent of the leaf that
contains the inode item. This adds unnecessary lock contention on log
trees when we have multiple fsyncs in parallel against inodes in the same
subvolume, which has a very significant impact due to the fact that log
trees are short lived and their height very rarely goes beyond level 2.

Also, by using btrfs_insert_empty_item() when we need to update the inode
item, we also end up splitting the leaf of the existing inode item when
the leaf has an amount of free space smaller than the size of an inode
item.

Improve this by using btrfs_seach_slot(), with a 0 "ins_len" argument,
when we know the inode item already exists in the log. This avoids these
two inefficiencies.

The following script, using fio, was used to perform the tests:

  $ cat fio-test.sh
  #!/bin/bash

  DEV=/dev/nvme0n1
  MNT=/mnt/nvme0n1
  MOUNT_OPTIONS="-o ssd"
  MKFS_OPTIONS="-d single -m single"

  if [ $# -ne 4 ]; then
    echo "Use $0 NUM_JOBS FILE_SIZE FSYNC_FREQ BLOCK_SIZE"
    exit 1
  fi

  NUM_JOBS=$1
  FILE_SIZE=$2
  FSYNC_FREQ=$3
  BLOCK_SIZE=$4

  cat <<EOF > /tmp/fio-job.ini
  [writers]
  rw=randwrite
  fsync=$FSYNC_FREQ
  fallocate=none
  group_reporting=1
  direct=0
  bs=$BLOCK_SIZE
  ioengine=sync
  size=$FILE_SIZE
  directory=$MNT
  numjobs=$NUM_JOBS
  EOF

  echo "performance" | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

  echo
  echo "Using config:"
  echo
  cat /tmp/fio-job.ini
  echo
  echo "mount options: $MOUNT_OPTIONS"
  echo

  umount $MNT &> /dev/null
  mkfs.btrfs -f $MKFS_OPTIONS $DEV
  mount $MOUNT_OPTIONS $DEV $MNT

  fio /tmp/fio-job.ini
  umount $MNT

The tests were done on a physical machine, with 12 cores, 64G of RAM,
using a NVMEe device and using a non-debug kernel config (the default one
from Debian). The summary line from fio is provided below for each test
run.

With 8 jobs, file size 256M, fsync frequency of 4 and a block size of 4K:

Before: WRITE: bw=28.3MiB/s (29.7MB/s), 28.3MiB/s-28.3MiB/s (29.7MB/s-29.7MB/s), io=2048MiB (2147MB), run=72297-72297msec
After:  WRITE: bw=28.7MiB/s (30.1MB/s), 28.7MiB/s-28.7MiB/s (30.1MB/s-30.1MB/s), io=2048MiB (2147MB), run=71411-71411msec

+1.4% throughput, -1.2% runtime

With 16 jobs, file size 256M, fsync frequency of 4 and a block size of 4K:

Before: WRITE: bw=40.0MiB/s (42.0MB/s), 40.0MiB/s-40.0MiB/s (42.0MB/s-42.0MB/s), io=4096MiB (4295MB), run=99980-99980msec
After:  WRITE: bw=40.9MiB/s (42.9MB/s), 40.9MiB/s-40.9MiB/s (42.9MB/s-42.9MB/s), io=4096MiB (4295MB), run=97933-97933msec

+2.2% throughput, -2.1% runtime

The changes are small but it's possible to be better on faster hardware as
in the test machine used disk utilization was pretty much 100% during the
whole time the tests were running (observed with 'iostat -xz 1').

The tests also included the previous patch with the subject of:
"btrfs: avoid unnecessary log mutex contention when syncing log".
So they compared a branch without that patch and without this patch versus
a branch with these two patches applied.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c | 39 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3e6c8f8a2909..8dde5c08a48f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3972,14 +3972,41 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 
 static int log_inode_item(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *log, struct btrfs_path *path,
-			  struct btrfs_inode *inode)
+			  struct btrfs_inode *inode, bool inode_item_dropped)
 {
 	struct btrfs_inode_item *inode_item;
 	int ret;
 
-	ret = btrfs_insert_empty_item(trans, log, path,
-				      &inode->location, sizeof(*inode_item));
-	if (ret && ret != -EEXIST)
+	/*
+	 * If we are doing a fast fsync and the inode was logged before in the
+	 * current transaction, then we know the inode was previously logged and
+	 * it exists in the log tree. For performance reasons, in this case use
+	 * btrfs_search_slot() directly with ins_len set to 0 so that we never
+	 * attempt a write lock on the leaf's parent, which adds unnecessary lock
+	 * contention in case there are concurrent fsyncs for other inodes of the
+	 * same subvolume. Using btrfs_insert_empty_item() when the inode item
+	 * already exists can also result in unnecessarily splitting a leaf.
+	 */
+	if (!inode_item_dropped && inode->logged_trans == trans->transid) {
+		ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1);
+		ASSERT(ret <= 0);
+		if (ret > 0)
+			ret = -ENOENT;
+	} else {
+		/*
+		 * This means it is the first fsync in the current transaction,
+		 * so the inode item is not in the log and we need to insert it.
+		 * We can never get -EEXIST because we are only called for a fast
+		 * fsync and in case an inode eviction happens after the inode was
+		 * logged before in the current transaction, when we load again
+		 * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime
+		 * flags and set ->logged_trans to 0.
+		 */
+		ret = btrfs_insert_empty_item(trans, log, path, &inode->location,
+					      sizeof(*inode_item));
+		ASSERT(ret != -EEXIST);
+	}
+	if (ret)
 		return ret;
 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				    struct btrfs_inode_item);
@@ -5303,6 +5330,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	bool need_log_inode_item = true;
 	bool xattrs_logged = false;
 	bool recursive_logging = false;
+	bool inode_item_dropped = true;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -5437,6 +5465,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 		} else {
 			if (inode_only == LOG_INODE_ALL)
 				fast_search = true;
+			inode_item_dropped = false;
 			goto log_extents;
 		}
 
@@ -5470,7 +5499,7 @@ log_extents:
 	btrfs_release_path(path);
 	btrfs_release_path(dst_path);
 	if (need_log_inode_item) {
-		err = log_inode_item(trans, log, dst_path, inode);
+		err = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
 		if (err)
 			goto out_unlock;
 		/*
-- 
cgit v1.2.3


From c7bcbb2120cb74ce8757e310e5ceea1f3a139597 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 22 Jul 2021 15:58:10 +0100
Subject: btrfs: remove ignore_offset argument from btrfs_find_all_roots()

Currently all the callers of btrfs_find_all_roots() pass a value of false
for its ignore_offset argument. This makes the argument pointless and we
can remove it and make btrfs_find_all_roots() always pass false as the
ignore_offset argument for btrfs_find_all_roots_safe(). So just do that.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c            |  4 ++--
 fs/btrfs/backref.h            |  2 +-
 fs/btrfs/qgroup.c             |  8 ++++----
 fs/btrfs/tests/qgroup-tests.c | 30 ++++++++++--------------------
 4 files changed, 17 insertions(+), 27 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 728c728d03f3..f735b8798ba1 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1488,14 +1488,14 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 			 struct btrfs_fs_info *fs_info, u64 bytenr,
 			 u64 time_seq, struct ulist **roots,
-			 bool ignore_offset, bool skip_commit_root_sem)
+			 bool skip_commit_root_sem)
 {
 	int ret;
 
 	if (!trans && !skip_commit_root_sem)
 		down_read(&fs_info->commit_root_sem);
 	ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr,
-					time_seq, roots, ignore_offset);
+					time_seq, roots, false);
 	if (!trans && !skip_commit_root_sem)
 		up_read(&fs_info->commit_root_sem);
 	return ret;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index ff5f07f9940b..ba454032dbe2 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -47,7 +47,7 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
 			 const u64 *extent_item_pos, bool ignore_offset);
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 			 struct btrfs_fs_info *fs_info, u64 bytenr,
-			 u64 time_seq, struct ulist **roots, bool ignore_offset,
+			 u64 time_seq, struct ulist **roots,
 			 bool skip_commit_root_sem);
 char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 			u32 name_len, unsigned long name_off,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 0fa121171ca1..db680f5be745 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1733,7 +1733,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
 	ASSERT(trans != NULL);
 
 	ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root,
-				   false, true);
+				   true);
 	if (ret < 0) {
 		trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
 		btrfs_warn(trans->fs_info,
@@ -2651,7 +2651,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
 				/* Search commit root to find old_roots */
 				ret = btrfs_find_all_roots(NULL, fs_info,
 						record->bytenr, 0,
-						&record->old_roots, false, false);
+						&record->old_roots, false);
 				if (ret < 0)
 					goto cleanup;
 			}
@@ -2667,7 +2667,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
 			 * current root. It's safe inside commit_transaction().
 			 */
 			ret = btrfs_find_all_roots(trans, fs_info,
-			   record->bytenr, BTRFS_SEQ_LAST, &new_roots, false, false);
+			   record->bytenr, BTRFS_SEQ_LAST, &new_roots, false);
 			if (ret < 0)
 				goto cleanup;
 			if (qgroup_to_skip) {
@@ -3201,7 +3201,7 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
 			num_bytes = found.offset;
 
 		ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
-					   &roots, false, false);
+					   &roots, false);
 		if (ret < 0)
 			goto out;
 		/* For rescan, just pass old_roots as NULL */
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 98b5aaba46f1..19ba7d5b7d8f 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -223,8 +223,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
 	 * we can only call btrfs_qgroup_account_extent() directly to test
 	 * quota.
 	 */
-	ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
-			false, false);
+	ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
 	if (ret) {
 		ulist_free(old_roots);
 		test_err("couldn't find old roots: %d", ret);
@@ -236,8 +235,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
 	if (ret)
 		return ret;
 
-	ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
-			false, false);
+	ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
 	if (ret) {
 		ulist_free(old_roots);
 		ulist_free(new_roots);
@@ -260,8 +258,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
 	old_roots = NULL;
 	new_roots = NULL;
 
-	ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
-			false, false);
+	ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
 	if (ret) {
 		ulist_free(old_roots);
 		test_err("couldn't find old roots: %d", ret);
@@ -272,8 +269,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
 	if (ret)
 		return -EINVAL;
 
-	ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
-			false, false);
+	ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
 	if (ret) {
 		ulist_free(old_roots);
 		ulist_free(new_roots);
@@ -324,8 +320,7 @@ static int test_multiple_refs(struct btrfs_root *root,
 		return ret;
 	}
 
-	ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
-			false, false);
+	ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
 	if (ret) {
 		ulist_free(old_roots);
 		test_err("couldn't find old roots: %d", ret);
@@ -337,8 +332,7 @@ static int test_multiple_refs(struct btrfs_root *root,
 	if (ret)
 		return ret;
 
-	ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
-			false, false);
+	ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
 	if (ret) {
 		ulist_free(old_roots);
 		ulist_free(new_roots);
@@ -359,8 +353,7 @@ static int test_multiple_refs(struct btrfs_root *root,
 		return -EINVAL;
 	}
 
-	ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
-			false, false);
+	ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
 	if (ret) {
 		ulist_free(old_roots);
 		test_err("couldn't find old roots: %d", ret);
@@ -372,8 +365,7 @@ static int test_multiple_refs(struct btrfs_root *root,
 	if (ret)
 		return ret;
 
-	ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
-			false, false);
+	ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
 	if (ret) {
 		ulist_free(old_roots);
 		ulist_free(new_roots);
@@ -400,8 +392,7 @@ static int test_multiple_refs(struct btrfs_root *root,
 		return -EINVAL;
 	}
 
-	ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
-			false, false);
+	ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
 	if (ret) {
 		ulist_free(old_roots);
 		test_err("couldn't find old roots: %d", ret);
@@ -413,8 +404,7 @@ static int test_multiple_refs(struct btrfs_root *root,
 	if (ret)
 		return ret;
 
-	ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
-			false, false);
+	ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
 	if (ret) {
 		ulist_free(old_roots);
 		ulist_free(new_roots);
-- 
cgit v1.2.3


From ad9a9378502d5a9da3a47666878246b9404a3391 Mon Sep 17 00:00:00 2001
From: Marcos Paulo de Souza <mpdesouza@suse.com>
Date: Tue, 13 Jul 2021 10:58:03 -0300
Subject: btrfs: use btrfs_next_leaf instead of btrfs_next_item when slots >
 nritems

After calling btrfs_search_slot is a common practice to check if the
slot found isn't bigger than number of slots in the current leaf, and if
so, search for the same key in the next leaf by calling btrfs_next_leaf,
which calls btrfs_next_old_leaf to do the job.

Calling btrfs_next_item in the same situation would end up in the same
code flow, since

* btrfs_next_item
  * btrfs_next_old_item
    * if slot >= nritems(curr_leaf)
      btrfs_next_old_leaf

Change btrfs_verify_dev_extents and calculate_emulated_zone_size
functions to use btrfs_next_leaf in the same situation.

Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 2 +-
 fs/btrfs/zoned.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index eb734099ccba..01127a4dfe91 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -8052,7 +8052,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
 		goto out;
 
 	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
-		ret = btrfs_next_item(root, path);
+		ret = btrfs_next_leaf(root, path);
 		if (ret < 0)
 			goto out;
 		/* No dev extents at all? Not good */
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 4f8bfceda095..47af1ab3bf12 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -245,7 +245,7 @@ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
 		goto out;
 
 	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
-		ret = btrfs_next_item(root, path);
+		ret = btrfs_next_leaf(root, path);
 		if (ret < 0)
 			goto out;
 		/* No dev extents at all? Not good */
-- 
cgit v1.2.3


From f8ee80de7bcf57bada19df887d8a7f87fd179cfa Mon Sep 17 00:00:00 2001
From: Marcos Paulo de Souza <mpdesouza@suse.com>
Date: Mon, 26 Jul 2021 15:51:07 -0300
Subject: btrfs: remove unneeded return variable in btrfs_lookup_file_extent

We can return from btrfs_search_slot directly which also shows that it
follows the same return value convention.

Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/file-item.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index df6631eefc65..2673c6ba7a4e 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -233,7 +233,6 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_path *path, u64 objectid,
 			     u64 offset, int mod)
 {
-	int ret;
 	struct btrfs_key file_key;
 	int ins_len = mod < 0 ? -1 : 0;
 	int cow = mod != 0;
@@ -241,8 +240,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 	file_key.objectid = objectid;
 	file_key.offset = offset;
 	file_key.type = BTRFS_EXTENT_DATA_KEY;
-	ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
-	return ret;
+
+	return btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
 }
 
 /*
-- 
cgit v1.2.3


From a7d1c5dc8632e9b370ad26478c468d4e4e29f263 Mon Sep 17 00:00:00 2001
From: Marcos Paulo de Souza <mpdesouza@suse.com>
Date: Mon, 26 Jul 2021 16:19:09 -0300
Subject: btrfs: introduce btrfs_lookup_match_dir

btrfs_search_slot is called in multiple places in dir-item.c to search
for a dir entry, and then calling btrfs_match_dir_name to return a
btrfs_dir_item.

In order to reduce the number of callers of btrfs_search_slot, create a
common function that looks for the dir key, and if found call
btrfs_match_dir_item_name.

Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/dir-item.c | 76 +++++++++++++++++++++++++++--------------------------
 1 file changed, 39 insertions(+), 37 deletions(-)

diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 98b63ebed539..f1274d5c3805 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -170,6 +170,25 @@ out_free:
 	return 0;
 }
 
+static struct btrfs_dir_item *btrfs_lookup_match_dir(
+			struct btrfs_trans_handle *trans,
+			struct btrfs_root *root, struct btrfs_path *path,
+			struct btrfs_key *key, const char *name,
+			int name_len, int mod)
+{
+	const int ins_len = (mod < 0 ? -1 : 0);
+	const int cow = (mod != 0);
+	int ret;
+
+	ret = btrfs_search_slot(trans, root, key, path, ins_len, cow);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	if (ret > 0)
+		return ERR_PTR(-ENOENT);
+
+	return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+}
+
 /*
  * lookup a directory item based on name.  'dir' is the objectid
  * we're searching in, and 'mod' tells us if you plan on deleting the
@@ -181,23 +200,18 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
 					     const char *name, int name_len,
 					     int mod)
 {
-	int ret;
 	struct btrfs_key key;
-	int ins_len = mod < 0 ? -1 : 0;
-	int cow = mod != 0;
+	struct btrfs_dir_item *di;
 
 	key.objectid = dir;
 	key.type = BTRFS_DIR_ITEM_KEY;
-
 	key.offset = btrfs_name_hash(name, name_len);
 
-	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
-	if (ret < 0)
-		return ERR_PTR(ret);
-	if (ret > 0)
+	di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+	if (IS_ERR(di) && PTR_ERR(di) == -ENOENT)
 		return NULL;
 
-	return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+	return di;
 }
 
 int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
@@ -211,7 +225,6 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
 	int slot;
 	struct btrfs_path *path;
 
-
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -220,20 +233,20 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
 	key.type = BTRFS_DIR_ITEM_KEY;
 	key.offset = btrfs_name_hash(name, name_len);
 
-	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-
-	/* return back any errors */
-	if (ret < 0)
-		goto out;
+	di = btrfs_lookup_match_dir(NULL, root, path, &key, name, name_len, 0);
+	if (IS_ERR(di)) {
+		ret = PTR_ERR(di);
+		/* Nothing found, we're safe */
+		if (ret == -ENOENT) {
+			ret = 0;
+			goto out;
+		}
 
-	/* nothing found, we're safe */
-	if (ret > 0) {
-		ret = 0;
-		goto out;
+		if (ret < 0)
+			goto out;
 	}
 
 	/* we found an item, look for our name in the item */
-	di = btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
 	if (di) {
 		/* our exact name was found */
 		ret = -EEXIST;
@@ -274,21 +287,13 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
 			    u64 objectid, const char *name, int name_len,
 			    int mod)
 {
-	int ret;
 	struct btrfs_key key;
-	int ins_len = mod < 0 ? -1 : 0;
-	int cow = mod != 0;
 
 	key.objectid = dir;
 	key.type = BTRFS_DIR_INDEX_KEY;
 	key.offset = objectid;
 
-	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
-	if (ret < 0)
-		return ERR_PTR(ret);
-	if (ret > 0)
-		return ERR_PTR(-ENOENT);
-	return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+	return btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
 }
 
 struct btrfs_dir_item *
@@ -345,21 +350,18 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
 					  const char *name, u16 name_len,
 					  int mod)
 {
-	int ret;
 	struct btrfs_key key;
-	int ins_len = mod < 0 ? -1 : 0;
-	int cow = mod != 0;
+	struct btrfs_dir_item *di;
 
 	key.objectid = dir;
 	key.type = BTRFS_XATTR_ITEM_KEY;
 	key.offset = btrfs_name_hash(name, name_len);
-	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
-	if (ret < 0)
-		return ERR_PTR(ret);
-	if (ret > 0)
+
+	di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+	if (IS_ERR(di) && PTR_ERR(di) == -ENOENT)
 		return NULL;
 
-	return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+	return di;
 }
 
 /*
-- 
cgit v1.2.3


From a129ffb8166a5a87162f79b0dd013044df68e497 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Tue, 27 Jul 2021 13:41:32 +0800
Subject: btrfs: remove unused start and end parameters from
 btrfs_run_delalloc_range()

Since commit d75855b4518b ("btrfs: Remove
extent_io_ops::writepage_start_hook") removes the writepage_start_hook()
and adds btrfs_writepage_cow_fixup() function, there is no need to
follow the old hook parameters.

Remove the @start and @end hook, since currently the fixup check is full
page check, it doesn't need @start and @end hook.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h     | 2 +-
 fs/btrfs/extent_io.c | 7 +++----
 fs/btrfs/inode.c     | 2 +-
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4a69aa604ac5..f265fb4f417f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3192,7 +3192,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
 int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
 		u64 start, u64 end, int *page_started, unsigned long *nr_written,
 		struct writeback_control *wbc);
-int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
+int btrfs_writepage_cow_fixup(struct page *page);
 void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
 					  struct page *page, u64 start,
 					  u64 end, int uptodate);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1f947e24091a..05637ef22b9e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3828,9 +3828,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 				 int *nr_ret)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
-	u64 start = page_offset(page);
-	u64 end = start + PAGE_SIZE - 1;
-	u64 cur = start;
+	u64 cur = page_offset(page);
+	u64 end = cur + PAGE_SIZE - 1;
 	u64 extent_offset;
 	u64 block_start;
 	struct extent_map *em;
@@ -3840,7 +3839,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 	const unsigned int write_flags = wbc_to_write_flags(wbc);
 	bool compressed;
 
-	ret = btrfs_writepage_cow_fixup(page, start, end);
+	ret = btrfs_writepage_cow_fixup(page);
 	if (ret) {
 		/* Fixup worker will requeue */
 		redirty_page_for_writepage(wbc, page);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index deec968f937f..30f2dc421cef 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2769,7 +2769,7 @@ out_page:
  * to fix it up.  The async helper will wait for ordered extents, set
  * the delalloc bit and make it safe to write the page.
  */
-int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
+int btrfs_writepage_cow_fixup(struct page *page)
 {
 	struct inode *inode = page->mapping->host;
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-- 
cgit v1.2.3


From 25c1252a026c6c34ff99c86f31856701b2192c0e Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 26 Jul 2021 14:15:08 +0200
Subject: btrfs: switch uptodate to bool in
 btrfs_writepage_endio_finish_ordered

The uptodate parameter should be bool, change the type.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h     | 2 +-
 fs/btrfs/extent_io.c | 8 ++++----
 fs/btrfs/inode.c     | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f265fb4f417f..194002f4e611 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3195,7 +3195,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
 int btrfs_writepage_cow_fixup(struct page *page);
 void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
 					  struct page *page, u64 start,
-					  u64 end, int uptodate);
+					  u64 end, bool uptodate);
 extern const struct dentry_operations btrfs_dentry_operations;
 extern const struct iomap_ops btrfs_dio_iomap_ops;
 extern const struct iomap_dio_ops btrfs_dio_ops;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 05637ef22b9e..19ba5b03b2df 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2779,7 +2779,7 @@ next:
 void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
 {
 	struct btrfs_inode *inode;
-	int uptodate = (err == 0);
+	const bool uptodate = (err == 0);
 	int ret = 0;
 
 	ASSERT(page && page->mapping);
@@ -3863,7 +3863,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 
 		if (cur >= i_size) {
 			btrfs_writepage_endio_finish_ordered(inode, page, cur,
-							     end, 1);
+							     end, true);
 			break;
 		}
 
@@ -3913,7 +3913,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 				nr++;
 			else
 				btrfs_writepage_endio_finish_ordered(inode,
-						page, cur, cur + iosize - 1, 1);
+						page, cur, cur + iosize - 1, true);
 			cur += iosize;
 			continue;
 		}
@@ -4982,7 +4982,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
 			ret = __extent_writepage(page, &wbc_writepages, &epd);
 		else {
 			btrfs_writepage_endio_finish_ordered(BTRFS_I(inode),
-					page, start, start + PAGE_SIZE - 1, 1);
+					page, start, start + PAGE_SIZE - 1, true);
 			unlock_page(page);
 		}
 		put_page(page);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 30f2dc421cef..d41aa8e5d0cc 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -972,7 +972,7 @@ retry:
 
 			p->mapping = inode->vfs_inode.i_mapping;
 			btrfs_writepage_endio_finish_ordered(inode, p, start,
-							     end, 0);
+							     end, false);
 
 			p->mapping = NULL;
 			extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
@@ -3170,7 +3170,7 @@ static void finish_ordered_fn(struct btrfs_work *work)
 
 void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
 					  struct page *page, u64 start,
-					  u64 end, int uptodate)
+					  u64 end, bool uptodate)
 {
 	trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
 
-- 
cgit v1.2.3


From f41b6ba93d8ef990c4acc70987bbc138c1926ebb Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 26 Jul 2021 14:15:10 +0200
Subject: btrfs: remove uptodate parameter from
 btrfs_dec_test_first_ordered_pending

In commit e65f152e4348 ("btrfs: refactor how we finish ordered extent io
for endio functions") there was last caller not using 1 for the uptodate
parameter. Now there's only one, passing 1, so we can remove it and
simplify the code.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c        | 2 +-
 fs/btrfs/ordered-data.c | 5 +----
 fs/btrfs/ordered-data.h | 2 +-
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d41aa8e5d0cc..83caf69295f7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8554,7 +8554,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
 		spin_unlock_irq(&inode->ordered_tree.lock);
 
 		if (btrfs_dec_test_ordered_pending(inode, &ordered,
-					cur, range_end + 1 - cur, 1)) {
+						   cur, range_end + 1 - cur)) {
 			btrfs_finish_ordered_io(ordered);
 			/*
 			 * The ordered extent has finished, now we're again
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 5c0f8481e25e..edb65abf0393 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -446,7 +446,6 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
  * 		 Will be also used to store the finished ordered extent.
  * @file_offset: File offset for the finished IO
  * @io_size:	 Length of the finish IO range
- * @uptodate:	 If the IO finishes without problem
  *
  * Return true if the ordered extent is finished in the range, and update
  * @cached.
@@ -457,7 +456,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
  */
 bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
 				    struct btrfs_ordered_extent **cached,
-				    u64 file_offset, u64 io_size, int uptodate)
+				    u64 file_offset, u64 io_size)
 {
 	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
 	struct rb_node *node;
@@ -486,8 +485,6 @@ have_entry:
 		       entry->bytes_left, io_size);
 
 	entry->bytes_left -= io_size;
-	if (!uptodate)
-		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
 
 	if (entry->bytes_left == 0) {
 		/*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index b2d88aba8420..4194e960ff61 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -177,7 +177,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
 				bool uptodate);
 bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
 				    struct btrfs_ordered_extent **cached,
-				    u64 file_offset, u64 io_size, int uptodate);
+				    u64 file_offset, u64 io_size);
 int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
 			     u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
 			     int type);
-- 
cgit v1.2.3


From 809d6902b3b05fd6b4494ff1460c227b99fcb4c3 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 26 Jul 2021 14:15:12 +0200
Subject: btrfs: make btrfs_next_leaf static inline

btrfs_next_leaf is a simple wrapper for btrfs_next_old_leaf so move it
to header to avoid the function call overhead.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c | 10 ----------
 fs/btrfs/ctree.h | 13 ++++++++++++-
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 63c026495193..99b33a5b33c8 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -4357,16 +4357,6 @@ next:
 	return 1;
 }
 
-/*
- * search the tree again to find a leaf with greater keys
- * returns 0 if it found something or 1 if there are no greater leaves.
- * returns < 0 on io errors.
- */
-int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
-{
-	return btrfs_next_old_leaf(root, path, 0);
-}
-
 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
 			u64 time_seq)
 {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 194002f4e611..3cccf0f05666 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2899,7 +2899,6 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 	return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
 }
 
-int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
 			u64 time_seq);
@@ -2911,6 +2910,18 @@ static inline int btrfs_next_old_item(struct btrfs_root *root,
 		return btrfs_next_old_leaf(root, p, time_seq);
 	return 0;
 }
+
+/*
+ * Search the tree again to find a leaf with greater keys.
+ *
+ * Returns 0 if it found something or 1 if there are no greater leaves.
+ * Returns < 0 on error.
+ */
+static inline int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
+{
+	return btrfs_next_old_leaf(root, path, 0);
+}
+
 static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
 {
 	return btrfs_next_old_item(root, p, 0);
-- 
cgit v1.2.3


From 0ac6e06b6c137e18d95070fdd3c6cbd319005ffb Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 26 Jul 2021 14:15:15 +0200
Subject: btrfs: tree-checker: use table values for stripe checks

There are hardcoded values in several checks regarding chunks and stripe
constraints. We have that defined in the raid table and ought to use it.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-checker.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index a8b2e0d2c025..ac9416cb4496 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -873,13 +873,18 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
 		}
 	}
 
-	if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
-		     (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) ||
-		     (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
-		     (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
-		     (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) ||
+	if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 &&
+		      sub_stripes != btrfs_raid_array[BTRFS_RAID_RAID10].sub_stripes) ||
+		     (type & BTRFS_BLOCK_GROUP_RAID1 &&
+		      num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1].devs_min) ||
+		     (type & BTRFS_BLOCK_GROUP_RAID5 &&
+		      num_stripes < btrfs_raid_array[BTRFS_RAID_RAID5].devs_min) ||
+		     (type & BTRFS_BLOCK_GROUP_RAID6 &&
+		      num_stripes < btrfs_raid_array[BTRFS_RAID_RAID6].devs_min) ||
+		     (type & BTRFS_BLOCK_GROUP_DUP &&
+		      num_stripes != btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes) ||
 		     ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
-		      num_stripes != 1))) {
+		      num_stripes != btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes))) {
 		chunk_err(leaf, chunk, logical,
 			"invalid num_stripes:sub_stripes %u:%u for profile %llu",
 			num_stripes, sub_stripes,
-- 
cgit v1.2.3


From 6c154ba41bd0b925428e73571df2f80dc8d082ba Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 26 Jul 2021 14:15:17 +0200
Subject: btrfs: tree-checker: add missing stripe checks for raid1c3/4 profiles

The stripe checks for raid1c3/raid1c4 are missing in the sequence in
btrfs_check_chunk_valid.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-checker.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index ac9416cb4496..7ba94b683ee3 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -877,6 +877,10 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
 		      sub_stripes != btrfs_raid_array[BTRFS_RAID_RAID10].sub_stripes) ||
 		     (type & BTRFS_BLOCK_GROUP_RAID1 &&
 		      num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1].devs_min) ||
+		     (type & BTRFS_BLOCK_GROUP_RAID1C3 &&
+		      num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C3].devs_min) ||
+		     (type & BTRFS_BLOCK_GROUP_RAID1C4 &&
+		      num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C4].devs_min) ||
 		     (type & BTRFS_BLOCK_GROUP_RAID5 &&
 		      num_stripes < btrfs_raid_array[BTRFS_RAID_RAID5].devs_min) ||
 		     (type & BTRFS_BLOCK_GROUP_RAID6 &&
-- 
cgit v1.2.3


From 500a44c9b301ae1844e38606c4bff4f15c174fb0 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 26 Jul 2021 14:15:19 +0200
Subject: btrfs: uninline btrfs_bg_flags_to_raid_index

The helper does a simple translation from block group flags to index to
the btrfs_raid_array table. There's no apparent reason to inline the
function, the translation happens usually once per function and is not
called in a loop.

Making it a proper function saves quite some binary code (x86_64,
release config):

   text    data     bss     dec     hex filename
1164011   19253   14912 1198176  124860 pre/btrfs.ko
1161559   19253   14912 1195724  123ecc post/btrfs.ko

DELTA: -2451

Also add the const attribute as there are no side effects, this could
help compiler to optimize a few things without the function body.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 26 ++++++++++++++++++++++++++
 fs/btrfs/volumes.h | 27 +--------------------------
 2 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 01127a4dfe91..40d850a7beba 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -153,6 +153,32 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 	},
 };
 
+/*
+ * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
+ * can be used as index to access btrfs_raid_array[].
+ */
+enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
+{
+	if (flags & BTRFS_BLOCK_GROUP_RAID10)
+		return BTRFS_RAID_RAID10;
+	else if (flags & BTRFS_BLOCK_GROUP_RAID1)
+		return BTRFS_RAID_RAID1;
+	else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
+		return BTRFS_RAID_RAID1C3;
+	else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
+		return BTRFS_RAID_RAID1C4;
+	else if (flags & BTRFS_BLOCK_GROUP_DUP)
+		return BTRFS_RAID_DUP;
+	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
+		return BTRFS_RAID_RAID0;
+	else if (flags & BTRFS_BLOCK_GROUP_RAID5)
+		return BTRFS_RAID_RAID5;
+	else if (flags & BTRFS_BLOCK_GROUP_RAID6)
+		return BTRFS_RAID_RAID6;
+
+	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
+}
+
 const char *btrfs_bg_type_to_raid_name(u64 flags)
 {
 	const int index = btrfs_bg_flags_to_raid_index(flags);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 70c749eee3ad..b082250b42e0 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -566,32 +566,6 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
 	atomic_inc(&dev->dev_stats_ccnt);
 }
 
-/*
- * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
- * can be used as index to access btrfs_raid_array[].
- */
-static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags)
-{
-	if (flags & BTRFS_BLOCK_GROUP_RAID10)
-		return BTRFS_RAID_RAID10;
-	else if (flags & BTRFS_BLOCK_GROUP_RAID1)
-		return BTRFS_RAID_RAID1;
-	else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
-		return BTRFS_RAID_RAID1C3;
-	else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
-		return BTRFS_RAID_RAID1C4;
-	else if (flags & BTRFS_BLOCK_GROUP_DUP)
-		return BTRFS_RAID_DUP;
-	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
-		return BTRFS_RAID_RAID0;
-	else if (flags & BTRFS_BLOCK_GROUP_RAID5)
-		return BTRFS_RAID_RAID5;
-	else if (flags & BTRFS_BLOCK_GROUP_RAID6)
-		return BTRFS_RAID_RAID6;
-
-	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
-}
-
 void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
 
 struct list_head * __attribute_const__ btrfs_get_fs_uuids(void);
@@ -601,6 +575,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
 			       struct block_device *bdev,
 			       const char *device_path);
 
+enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags);
 int btrfs_bg_type_to_factor(u64 flags);
 const char *btrfs_bg_type_to_raid_name(u64 flags);
 int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
-- 
cgit v1.2.3


From fe4f46d40c1c2ff78a8a7280e455f115c32e6b41 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 26 Jul 2021 14:15:21 +0200
Subject: btrfs: merge alloc_device helpers

The device allocation is split to two functions, but one just calls the
other and they're very far in the file. Merge them together.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 66 +++++++++++++++++++++---------------------------------
 1 file changed, 25 insertions(+), 41 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 40d850a7beba..23159bd29c6d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -430,44 +430,6 @@ void __exit btrfs_cleanup_fs_uuids(void)
 	}
 }
 
-/*
- * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
- * Returned struct is not linked onto any lists and must be destroyed using
- * btrfs_free_device.
- */
-static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
-{
-	struct btrfs_device *dev;
-
-	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
-	if (!dev)
-		return ERR_PTR(-ENOMEM);
-
-	/*
-	 * Preallocate a bio that's always going to be used for flushing device
-	 * barriers and matches the device lifespan
-	 */
-	dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
-	if (!dev->flush_bio) {
-		kfree(dev);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	INIT_LIST_HEAD(&dev->dev_list);
-	INIT_LIST_HEAD(&dev->dev_alloc_list);
-	INIT_LIST_HEAD(&dev->post_commit_list);
-
-	atomic_set(&dev->reada_in_flight, 0);
-	atomic_set(&dev->dev_stats_ccnt, 0);
-	btrfs_device_data_ordered_init(dev);
-	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
-	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
-	extent_io_tree_init(fs_info, &dev->alloc_state,
-			    IO_TREE_DEVICE_ALLOC_STATE, NULL);
-
-	return dev;
-}
-
 static noinline struct btrfs_fs_devices *find_fsid(
 		const u8 *fsid, const u8 *metadata_fsid)
 {
@@ -6857,9 +6819,31 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
 	if (WARN_ON(!devid && !fs_info))
 		return ERR_PTR(-EINVAL);
 
-	dev = __alloc_device(fs_info);
-	if (IS_ERR(dev))
-		return dev;
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * Preallocate a bio that's always going to be used for flushing device
+	 * barriers and matches the device lifespan
+	 */
+	dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
+	if (!dev->flush_bio) {
+		kfree(dev);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	INIT_LIST_HEAD(&dev->dev_list);
+	INIT_LIST_HEAD(&dev->dev_alloc_list);
+	INIT_LIST_HEAD(&dev->post_commit_list);
+
+	atomic_set(&dev->reada_in_flight, 0);
+	atomic_set(&dev->dev_stats_ccnt, 0);
+	btrfs_device_data_ordered_init(dev);
+	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+	extent_io_tree_init(fs_info, &dev->alloc_state,
+			    IO_TREE_DEVICE_ALLOC_STATE, NULL);
 
 	if (devid)
 		tmp = *devid;
-- 
cgit v1.2.3


From d58ede8d1d9fb0f70d6aa51fa6550d2d580f8c17 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 26 Jul 2021 14:15:24 +0200
Subject: btrfs: simplify data stripe calculation helpers

There are two helpers doing the same calculations based on nparity and
ncopies. calc_data_stripes can be simplified into one expression, so far
we don't have profile with both copies and parity, so there's no
effective change. calc_stripe_length should reuse the helper and not
repeat the same calculation.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 23159bd29c6d..c7339f163bda 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3568,10 +3568,7 @@ static u64 calc_data_stripes(u64 type, int num_stripes)
 	const int ncopies = btrfs_raid_array[index].ncopies;
 	const int nparity = btrfs_raid_array[index].nparity;
 
-	if (nparity)
-		return num_stripes - nparity;
-	else
-		return num_stripes / ncopies;
+	return (num_stripes - nparity) / ncopies;
 }
 
 /* [pstart, pend) */
@@ -6879,15 +6876,7 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
 
 static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
 {
-	int index = btrfs_bg_flags_to_raid_index(type);
-	int ncopies = btrfs_raid_array[index].ncopies;
-	const int nparity = btrfs_raid_array[index].nparity;
-	int data_stripes;
-
-	if (nparity)
-		data_stripes = num_stripes - nparity;
-	else
-		data_stripes = num_stripes / ncopies;
+	const int data_stripes = calc_data_stripes(type, num_stripes);
 
 	return div_u64(chunk_len, data_stripes);
 }
-- 
cgit v1.2.3


From 214cc184321743327c84c4a13ad08d088dfb3c4a Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 26 Jul 2021 14:15:26 +0200
Subject: btrfs: constify and cleanup variables in comparators

Comparators just read the data and thus get const parameters. This
should be also preserved by the local variables, update all comparators
passed to sort or bsearch.

Cleanups:

- unnecessary casts are dropped
- btrfs_cmp_device_free_bytes is cleaned up to follow the common pattern
  and 'inline' is dropped as the function address is taken

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/raid56.c   |  8 ++++----
 fs/btrfs/send.c     |  6 +++---
 fs/btrfs/super.c    | 13 ++++++-------
 fs/btrfs/tree-log.c |  2 +-
 fs/btrfs/volumes.c  |  2 +-
 5 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index a40a45a007d4..d8d268ca8aa7 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1636,10 +1636,10 @@ struct btrfs_plug_cb {
 static int plug_cmp(void *priv, const struct list_head *a,
 		    const struct list_head *b)
 {
-	struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
-						 plug_list);
-	struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
-						 plug_list);
+	const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
+						       plug_list);
+	const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
+						       plug_list);
 	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
 	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
 
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 6ac37ae6c811..75cff564dedf 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1198,7 +1198,7 @@ struct backref_ctx {
 static int __clone_root_cmp_bsearch(const void *key, const void *elt)
 {
 	u64 root = (u64)(uintptr_t)key;
-	struct clone_root *cr = (struct clone_root *)elt;
+	const struct clone_root *cr = elt;
 
 	if (root < cr->root->root_key.objectid)
 		return -1;
@@ -1209,8 +1209,8 @@ static int __clone_root_cmp_bsearch(const void *key, const void *elt)
 
 static int __clone_root_cmp_sort(const void *e1, const void *e2)
 {
-	struct clone_root *cr1 = (struct clone_root *)e1;
-	struct clone_root *cr2 = (struct clone_root *)e2;
+	const struct clone_root *cr1 = e1;
+	const struct clone_root *cr2 = e2;
 
 	if (cr1->root->root_key.objectid < cr2->root->root_key.objectid)
 		return -1;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d07b18b2b250..35ff142ad242 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2096,16 +2096,15 @@ restore:
 }
 
 /* Used to sort the devices by max_avail(descending sort) */
-static inline int btrfs_cmp_device_free_bytes(const void *dev_info1,
-				       const void *dev_info2)
+static int btrfs_cmp_device_free_bytes(const void *a, const void *b)
 {
-	if (((struct btrfs_device_info *)dev_info1)->max_avail >
-	    ((struct btrfs_device_info *)dev_info2)->max_avail)
+	const struct btrfs_device_info *dev_info1 = a;
+	const struct btrfs_device_info *dev_info2 = b;
+
+	if (dev_info1->max_avail > dev_info2->max_avail)
 		return -1;
-	else if (((struct btrfs_device_info *)dev_info1)->max_avail <
-		 ((struct btrfs_device_info *)dev_info2)->max_avail)
+	else if (dev_info1->max_avail < dev_info2->max_avail)
 		return 1;
-	else
 	return 0;
 }
 
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 8dde5c08a48f..191dea1d2416 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4191,7 +4191,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 static int extent_cmp(void *priv, const struct list_head *a,
 		      const struct list_head *b)
 {
-	struct extent_map *em1, *em2;
+	const struct extent_map *em1, *em2;
 
 	em1 = list_entry(a, struct extent_map, list);
 	em2 = list_entry(b, struct extent_map, list);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c7339f163bda..d42fb61aadc3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1216,7 +1216,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
 static int devid_cmp(void *priv, const struct list_head *a,
 		     const struct list_head *b)
 {
-	struct btrfs_device *dev1, *dev2;
+	const struct btrfs_device *dev1, *dev2;
 
 	dev1 = list_entry(a, struct btrfs_device, dev_list);
 	dev2 = list_entry(b, struct btrfs_device, dev_list);
-- 
cgit v1.2.3


From 4c37a7938496756649096a7ec26320eb8b0d90fb Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 26 Jul 2021 14:34:50 +0800
Subject: btrfs: reset this_bio_flag to avoid inheriting old flags

In btrfs_do_readpage(), we never reset @this_bio_flag after we hit a
compressed extent.

This is fine, as for PAGE_SIZE == sectorsize case, we can only have one
sector for one page, thus @this_bio_flag will only be set at most once.

But for subpage case, after hitting a compressed extent, @this_bio_flag
will always have EXTENT_BIO_COMPRESSED bit, even we're reading a regular
extent.

This will lead to various read errors, and causing new ASSERT() in
incoming subpage patches, which adds more strict check in
btrfs_submit_compressed_read().

Fix it by declaring @this_bio_flag inside the main loop and reset its
value for each iteration.

Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 19ba5b03b2df..4ee10669ed6b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3487,7 +3487,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 	size_t pg_offset = 0;
 	size_t iosize;
 	size_t blocksize = inode->i_sb->s_blocksize;
-	unsigned long this_bio_flag = 0;
 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 
 	ret = set_page_extent_mapped(page);
@@ -3518,6 +3517,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 	}
 	begin_page_read(fs_info, page);
 	while (cur <= end) {
+		unsigned long this_bio_flag = 0;
 		bool force_bio_submit = false;
 		u64 disk_bytenr;
 
-- 
cgit v1.2.3


From 3670e6451bc9c39ab3a46f1da19360219e4319f3 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 26 Jul 2021 14:34:51 +0800
Subject: btrfs: subpage: check if there are compressed extents inside one page

[BUG]
When testing experimental subpage compressed write support, it hits a
NULL pointer dereference inside read path:

 Unable to handle kernel NULL pointer dereference at virtual address 0000000000000018
 pc : __pi_memcmp+0x28/0x1ec
 lr : check_data_csum+0xd0/0x274 [btrfs]
 Call trace:
  __pi_memcmp+0x28/0x1ec
  btrfs_verify_data_csum+0xf4/0x244 [btrfs]
  end_bio_extent_readpage+0x1d0/0x6b0 [btrfs]
  bio_endio+0x15c/0x1dc
  end_workqueue_fn+0x44/0x64 [btrfs]
  btrfs_work_helper+0x74/0x250 [btrfs]
  process_one_work+0x1d4/0x47c
  worker_thread+0x180/0x400
  kthread+0x11c/0x120
  ret_from_fork+0x10/0x30
 Code: 54000261 d100044c d343fd8c f8408403 (f8408424)
 ---[ end trace 9e2c59f33ea40866 ]---

[CAUSE]
When reading two compressed extents inside the same page, like the
following layout, we trigger above crash:

	0	32K	64K
	|-------|\\\\\\\|
	     |	     \- Compressed extent (A)
	     \--------- Compressed extent (B)

For compressed read, we don't need to populate its io_bio->csum, as we
rely on compressed_bio->csum to verify the compressed data, and then
copy the decompressed to inode pages.

Normally btrfs_verify_data_csum() skip such page by checking and
clearing its PageChecked flag

But since that flag is still for the full page, when endio for inode
page range [0, 32K) gets executed, it clears PageChecked flag for the
full page.

Then when endio for inode page range [32K, 64K) gets executed, since the
page no longer has PageChecked flag, it just continues checking, even
though io_bio->csum is NULL.

[FIX]
Thankfully there are only two users of PageChecked bit:

- Cow fixup
  Since subpage has its own way to trace page dirty (dirty_bitmap) and
  ordered bit (ordered_bitmap), it should never trigger cow fixup.

- Compressed read
  We can distinguish such read by just checking io_bio->csum.

So just check io_bio->csum before doing the verification to avoid such
NULL pointer dereference.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 83caf69295f7..ae9e4ad1949a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3256,6 +3256,20 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
 		return 0;
 	}
 
+	/*
+	 * For subpage case, above PageChecked is not safe as it's not subpage
+	 * compatible.
+	 * But for now only cow fixup and compressed read utilize PageChecked
+	 * flag, while in this context we can easily use io_bio->csum to
+	 * determine if we really need to do csum verification.
+	 *
+	 * So for now, just exit if io_bio->csum is NULL, as it means it's
+	 * compressed read, and its compressed data csum has already been
+	 * verified.
+	 */
+	if (io_bio->csum == NULL)
+		return 0;
+
 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
 		return 0;
 
-- 
cgit v1.2.3


From ca62e85ded2c02f5e5d469103f2a5b6b09883dcc Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 26 Jul 2021 14:34:52 +0800
Subject: btrfs: disable compressed readahead for subpage

For current subpage support, we only support 64K page size with 4K
sector size.

This makes compressed readahead less effective, as maximum compressed
extent size is only 128K, 2x the page size.

On the other hand, the function add_ra_bio_pages() is still assuming
sectorsize == PAGE_SIZE, and code change may affect 4K page size
systems.

So for now, let's disable subpage compressed readahead for now.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/compression.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index aeda426b6121..18579b904d5c 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -564,6 +564,16 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 	if (isize == 0)
 		return 0;
 
+	/*
+	 * For current subpage support, we only support 64K page size,
+	 * which means maximum compressed extent size (128K) is just 2x page
+	 * size.
+	 * This makes readahead less effective, so here disable readahead for
+	 * subpage for now, until full compressed write is supported.
+	 */
+	if (btrfs_sb(inode->i_sb)->sectorsize < PAGE_SIZE)
+		return 0;
+
 	end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
 
 	while (last_offset < compressed_end) {
-- 
cgit v1.2.3


From 557023ea9f06baf2659b232b08b8e8711f7001a6 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 5 Jul 2021 10:00:56 +0800
Subject: btrfs: grab correct extent map for subpage compressed extent read

[BUG]
When subpage compressed read write support is enabled, btrfs/038 always
fails with EIO.

A simplified script can easily trigger the problem:

  mkfs.btrfs -f -s 4k $dev
  mount $dev $mnt -o compress=lzo

  xfs_io -f -c "truncate 118811" $mnt/foo
  xfs_io -c "pwrite -S 0x0d -b 39987 92267 39987" $mnt/foo > /dev/null

  sync
  btrfs subvolume snapshot -r $mnt $mnt/mysnap1

  xfs_io -c "pwrite -S 0x3e -b 80000 200000 80000" $mnt/foo > /dev/null
  sync

  xfs_io -c "pwrite -S 0xdc -b 10000 250000 10000" $mnt/foo > /dev/null
  xfs_io -c "pwrite -S 0xff -b 10000 300000 10000" $mnt/foo > /dev/null

  sync
  btrfs subvolume snapshot -r $mnt $mnt/mysnap2

  cat $mnt/mysnap2/foo
  # Above cat will fail due to EIO

[CAUSE]
The problem is in btrfs_submit_compressed_read().

When it tries to grab the extent map of the read range, it uses the
following call:

	em = lookup_extent_mapping(em_tree,
  				   page_offset(bio_first_page_all(bio)),
				   fs_info->sectorsize);

The problem is in the page_offset(bio_first_page_all(bio)) part.

The offending inode has the following file extent layout

        item 10 key (257 EXTENT_DATA 131072) itemoff 15639 itemsize 53
                generation 8 type 1 (regular)
                extent data disk byte 13680640 nr 4096
                extent data offset 0 nr 4096 ram 4096
                extent compression 0 (none)
        item 11 key (257 EXTENT_DATA 135168) itemoff 15586 itemsize 53
                generation 8 type 1 (regular)
                extent data disk byte 0 nr 0
        item 12 key (257 EXTENT_DATA 196608) itemoff 15533 itemsize 53
                generation 8 type 1 (regular)
                extent data disk byte 13676544 nr 4096
                extent data offset 0 nr 53248 ram 86016
                extent compression 2 (lzo)

And the bio passed in has the following parameters:

page_offset(bio_first_page_all(bio))	= 131072
bio_first_bvec_all(bio)->bv_offset	= 65536

If we use page_offset(bio_first_page_all(bio) without adding bv_offset,
we will get an extent map for file offset 131072, not 196608.

This means we read uncompressed data from disk, and later decompression
will definitely fail.

[FIX]
Take bv_offset into consideration when trying to grab an extent map.

And add an ASSERT() to ensure we're really getting a compressed extent.

Thankfully this won't affect anything but subpage, thus we only need to
ensure this patch get merged before we enabled basic subpage support.

Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/compression.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 18579b904d5c..41ee862470b0 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -682,6 +682,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	struct page *page;
 	struct bio *comp_bio;
 	u64 cur_disk_byte = bio->bi_iter.bi_sector << 9;
+	u64 file_offset;
 	u64 em_len;
 	u64 em_start;
 	struct extent_map *em;
@@ -691,15 +692,17 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 	em_tree = &BTRFS_I(inode)->extent_tree;
 
+	file_offset = bio_first_bvec_all(bio)->bv_offset +
+		      page_offset(bio_first_page_all(bio));
+
 	/* we need the actual starting offset of this extent in the file */
 	read_lock(&em_tree->lock);
-	em = lookup_extent_mapping(em_tree,
-				   page_offset(bio_first_page_all(bio)),
-				   fs_info->sectorsize);
+	em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
 	read_unlock(&em_tree->lock);
 	if (!em)
 		return BLK_STS_IOERR;
 
+	ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
 	compressed_len = em->block_len;
 	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
 	if (!cb)
-- 
cgit v1.2.3


From 1c3dc1731ed2b3757b25533c5245926ffc94f7dc Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 5 Jul 2021 10:00:58 +0800
Subject: btrfs: rework btrfs_decompress_buf2page()

There are several bugs inside the function btrfs_decompress_buf2page()

- @start_byte doesn't take bvec.bv_offset into consideration
  Thus it can't handle case where the target range is not page aligned.

- Too many helper variables
  There are tons of helper variables, @buf_offset, @current_buf_start,
  @start_byte, @prev_start_byte, @working_bytes, @bytes.
  This hurts anyone who wants to read the function.

- No obvious main cursor for the iteartion
  A new problem caused by previous problem.

- Comments for parameter list makes no sense
  Like @buf_start is the offset to @buf, or offset inside the full
  decompressed extent? (Spoiler alert, the later case)
  And @total_out acts more like @buf_start + @size_of_buf.

  The worst is @disk_start.
  The real meaning of it is the file offset of the full decompressed
  extent.

This patch will rework the whole function by:

- Add a proper comment with ASCII art to explain the parameter list

- Rework parameter list
  The old @buf_start is renamed to @decompressed, to show how many bytes
  are already decompressed inside the full decompressed extent.
  The old @total_out is replaced by @buf_len, which is the decompressed
  data size.
  For old @disk_start and @bio, just pass @compressed_bio in.

- Use single main cursor
  The main cursor will be @cur_file_offset, to show what's the current
  file offset.
  Other helper variables will be declared inside the main loop, and only
  minimal amount of helper variables:
  * offset_inside_decompressed_buf:	The only real helper
  * copy_start_file_offset:		File offset we start memcpy
  * bvec_file_offset:			File offset of current bvec

Even with all these extensive comments, the final function is still
smaller than the original function, which is definitely a win.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/compression.c | 144 ++++++++++++++++++++++---------------------------
 fs/btrfs/compression.h |   5 +-
 fs/btrfs/lzo.c         |   8 ++-
 fs/btrfs/zlib.c        |  12 ++---
 fs/btrfs/zstd.c        |   6 +--
 5 files changed, 76 insertions(+), 99 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 41ee862470b0..7869ad12bc6e 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -1272,96 +1272,82 @@ void __cold btrfs_exit_compress(void)
 }
 
 /*
- * Copy uncompressed data from working buffer to pages.
+ * Copy decompressed data from working buffer to pages.
  *
- * buf_start is the byte offset we're of the start of our workspace buffer.
+ * @buf:		The decompressed data buffer
+ * @buf_len:		The decompressed data length
+ * @decompressed:	Number of bytes that are already decompressed inside the
+ * 			compressed extent
+ * @cb:			The compressed extent descriptor
+ * @orig_bio:		The original bio that the caller wants to read for
  *
- * total_out is the last byte of the buffer
+ * An easier to understand graph is like below:
+ *
+ * 		|<- orig_bio ->|     |<- orig_bio->|
+ * 	|<-------      full decompressed extent      ----->|
+ * 	|<-----------    @cb range   ---->|
+ * 	|			|<-- @buf_len -->|
+ * 	|<--- @decompressed --->|
+ *
+ * Note that, @cb can be a subpage of the full decompressed extent, but
+ * @cb->start always has the same as the orig_file_offset value of the full
+ * decompressed extent.
+ *
+ * When reading compressed extent, we have to read the full compressed extent,
+ * while @orig_bio may only want part of the range.
+ * Thus this function will ensure only data covered by @orig_bio will be copied
+ * to.
+ *
+ * Return 0 if we have copied all needed contents for @orig_bio.
+ * Return >0 if we need continue decompress.
  */
-int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
-			      unsigned long total_out, u64 disk_start,
-			      struct bio *bio)
+int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
+			      struct compressed_bio *cb, u32 decompressed)
 {
-	unsigned long buf_offset;
-	unsigned long current_buf_start;
-	unsigned long start_byte;
-	unsigned long prev_start_byte;
-	unsigned long working_bytes = total_out - buf_start;
-	unsigned long bytes;
-	struct bio_vec bvec = bio_iter_iovec(bio, bio->bi_iter);
-
-	/*
-	 * start byte is the first byte of the page we're currently
-	 * copying into relative to the start of the compressed data.
-	 */
-	start_byte = page_offset(bvec.bv_page) - disk_start;
-
-	/* we haven't yet hit data corresponding to this page */
-	if (total_out <= start_byte)
-		return 1;
-
-	/*
-	 * the start of the data we care about is offset into
-	 * the middle of our working buffer
-	 */
-	if (total_out > start_byte && buf_start < start_byte) {
-		buf_offset = start_byte - buf_start;
-		working_bytes -= buf_offset;
-	} else {
-		buf_offset = 0;
-	}
-	current_buf_start = buf_start;
-
-	/* copy bytes from the working buffer into the pages */
-	while (working_bytes > 0) {
-		bytes = min_t(unsigned long, bvec.bv_len,
-				PAGE_SIZE - (buf_offset % PAGE_SIZE));
-		bytes = min(bytes, working_bytes);
+	struct bio *orig_bio = cb->orig_bio;
+	/* Offset inside the full decompressed extent */
+	u32 cur_offset;
+
+	cur_offset = decompressed;
+	/* The main loop to do the copy */
+	while (cur_offset < decompressed + buf_len) {
+		struct bio_vec bvec;
+		size_t copy_len;
+		u32 copy_start;
+		/* Offset inside the full decompressed extent */
+		u32 bvec_offset;
+
+		bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter);
+		/*
+		 * cb->start may underflow, but subtracting that value can still
+		 * give us correct offset inside the full decompressed extent.
+		 */
+		bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start;
 
-		memcpy_to_page(bvec.bv_page, bvec.bv_offset, buf + buf_offset,
-			       bytes);
-		flush_dcache_page(bvec.bv_page);
+		/* Haven't reached the bvec range, exit */
+		if (decompressed + buf_len <= bvec_offset)
+			return 1;
 
-		buf_offset += bytes;
-		working_bytes -= bytes;
-		current_buf_start += bytes;
-
-		/* check if we need to pick another page */
-		bio_advance(bio, bytes);
-		if (!bio->bi_iter.bi_size)
-			return 0;
-		bvec = bio_iter_iovec(bio, bio->bi_iter);
-		prev_start_byte = start_byte;
-		start_byte = page_offset(bvec.bv_page) - disk_start;
+		copy_start = max(cur_offset, bvec_offset);
+		copy_len = min(bvec_offset + bvec.bv_len,
+			       decompressed + buf_len) - copy_start;
+		ASSERT(copy_len);
 
 		/*
-		 * We need to make sure we're only adjusting
-		 * our offset into compression working buffer when
-		 * we're switching pages.  Otherwise we can incorrectly
-		 * keep copying when we were actually done.
+		 * Extra range check to ensure we didn't go beyond
+		 * @buf + @buf_len.
 		 */
-		if (start_byte != prev_start_byte) {
-			/*
-			 * make sure our new page is covered by this
-			 * working buffer
-			 */
-			if (total_out <= start_byte)
-				return 1;
+		ASSERT(copy_start - decompressed < buf_len);
+		memcpy_to_page(bvec.bv_page, bvec.bv_offset,
+			       buf + copy_start - decompressed, copy_len);
+		flush_dcache_page(bvec.bv_page);
+		cur_offset += copy_len;
 
-			/*
-			 * the next page in the biovec might not be adjacent
-			 * to the last page, but it might still be found
-			 * inside this working buffer. bump our offset pointer
-			 */
-			if (total_out > start_byte &&
-			    current_buf_start < start_byte) {
-				buf_offset = start_byte - buf_start;
-				working_bytes = total_out - start_byte;
-				current_buf_start = buf_start + buf_offset;
-			}
-		}
+		bio_advance(orig_bio, copy_len);
+		/* Finished the bio */
+		if (!orig_bio->bi_iter.bi_size)
+			return 0;
 	}
-
 	return 1;
 }
 
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index c359f20920d0..399be0b435bf 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -86,9 +86,8 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
 			 unsigned long *total_out);
 int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
 		     unsigned long start_byte, size_t srclen, size_t destlen);
-int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
-			      unsigned long total_out, u64 disk_start,
-			      struct bio *bio);
+int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
+			      struct compressed_bio *cb, u32 decompressed);
 
 blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 				  unsigned int len, u64 disk_start,
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 576a0e6142ad..6cab15e52cec 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -293,8 +293,6 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 	unsigned long tot_len;
 	char *buf;
 	struct page **pages_in = cb->compressed_pages;
-	u64 disk_start = cb->start;
-	struct bio *orig_bio = cb->orig_bio;
 
 	data_in = page_address(pages_in[0]);
 	tot_len = read_compress_length(data_in);
@@ -391,14 +389,14 @@ cont:
 		buf_start = tot_out;
 		tot_out += out_len;
 
-		ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
-						 tot_out, disk_start, orig_bio);
+		ret2 = btrfs_decompress_buf2page(workspace->buf, out_len,
+						 cb, buf_start);
 		if (ret2 == 0)
 			break;
 	}
 done:
 	if (!ret)
-		zero_fill_bio(orig_bio);
+		zero_fill_bio(cb->orig_bio);
 	return ret;
 }
 
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 5e18d7ad75a4..8afa90074891 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -275,8 +275,6 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 	unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
 	unsigned long buf_start;
 	struct page **pages_in = cb->compressed_pages;
-	u64 disk_start = cb->start;
-	struct bio *orig_bio = cb->orig_bio;
 
 	data_in = page_address(pages_in[page_in_index]);
 	workspace->strm.next_in = data_in;
@@ -314,9 +312,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 		if (buf_start == total_out)
 			break;
 
-		ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
-						 total_out, disk_start,
-						 orig_bio);
+		ret2 = btrfs_decompress_buf2page(workspace->buf,
+				total_out - buf_start, cb, buf_start);
 		if (ret2 == 0) {
 			ret = 0;
 			goto done;
@@ -336,8 +333,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 			data_in = page_address(pages_in[page_in_index]);
 			workspace->strm.next_in = data_in;
 			tmp = srclen - workspace->strm.total_in;
-			workspace->strm.avail_in = min(tmp,
-							   PAGE_SIZE);
+			workspace->strm.avail_in = min(tmp, PAGE_SIZE);
 		}
 	}
 	if (ret != Z_STREAM_END)
@@ -347,7 +343,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 done:
 	zlib_inflateEnd(&workspace->strm);
 	if (!ret)
-		zero_fill_bio(orig_bio);
+		zero_fill_bio(cb->orig_bio);
 	return ret;
 }
 
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 200ba08bfae6..56dce9f00988 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -540,8 +540,6 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 {
 	struct workspace *workspace = list_entry(ws, struct workspace, list);
 	struct page **pages_in = cb->compressed_pages;
-	u64 disk_start = cb->start;
-	struct bio *orig_bio = cb->orig_bio;
 	size_t srclen = cb->compressed_len;
 	ZSTD_DStream *stream;
 	int ret = 0;
@@ -582,7 +580,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 		workspace->out_buf.pos = 0;
 
 		ret = btrfs_decompress_buf2page(workspace->out_buf.dst,
-				buf_start, total_out, disk_start, orig_bio);
+				total_out - buf_start, cb, buf_start);
 		if (ret == 0)
 			break;
 
@@ -607,7 +605,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 		}
 	}
 	ret = 0;
-	zero_fill_bio(orig_bio);
+	zero_fill_bio(cb->orig_bio);
 done:
 	return ret;
 }
-- 
cgit v1.2.3


From a6e66e6f8c1b685e11b778bef614480a9c1a5278 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 26 Jul 2021 14:34:55 +0800
Subject: btrfs: rework lzo_decompress_bio() to make it subpage compatible

For the initial subpage support, although we won't support compressed
write, we still need to support compressed read.

But for lzo_decompress_bio() it has several problems:

- The abuse of PAGE_SIZE for boundary detection
  For subpage case, we should follow sectorsize to detect the padding
  zeros.
  Using PAGE_SIZE will cause subpage compress read to skip certain
  bytes, and causing read error.

- Too many helper variables
  There are half a dozen helper variables, which is only making things
  harder to read

This patch will rework lzo_decompress_bio() to make it work for subpage:

- Use sectorsize to do boundary check, while still use PAGE_SIZE for
  page switching
  This allows us to have the same on-disk format for 4K sectorsize fs,
  while take advantage of larger page size.

- Use two main cursors
  Only @cur_in and @cur_out is utilized as the main cursor.
  The helper variables will only be declared inside the loop, and only 2
  helper variables needed.

- Introduce a helper function to copy compressed segment payload
  Introduce a new helper, copy_compressed_segment(), to copy a
  compressed segment to workspace buffer.
  This function will handle the page switching.

Now the net result is, with all the excessive comments and new helper
function, the refactored code is still smaller, and easier to read.

For other decompression code, they have no special padding rule, thus no
need to bother for initial subpage support, but will be refactored to
the same style later.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/lzo.c | 196 ++++++++++++++++++++++++++-------------------------------
 1 file changed, 90 insertions(+), 106 deletions(-)

diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 6cab15e52cec..c25dfd1a8a54 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -14,6 +14,7 @@
 #include <linux/lzo.h>
 #include <linux/refcount.h>
 #include "compression.h"
+#include "ctree.h"
 
 #define LZO_LEN	4
 
@@ -271,130 +272,113 @@ out:
 	return ret;
 }
 
+/*
+ * Copy the compressed segment payload into @dest.
+ *
+ * For the payload there will be no padding, just need to do page switching.
+ */
+static void copy_compressed_segment(struct compressed_bio *cb,
+				    char *dest, u32 len, u32 *cur_in)
+{
+	u32 orig_in = *cur_in;
+
+	while (*cur_in < orig_in + len) {
+		struct page *cur_page;
+		u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in),
+					  orig_in + len - *cur_in);
+
+		ASSERT(copy_len);
+		cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE];
+
+		memcpy(dest + *cur_in - orig_in,
+			page_address(cur_page) + offset_in_page(*cur_in),
+			copy_len);
+
+		*cur_in += copy_len;
+	}
+}
+
 int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 {
 	struct workspace *workspace = list_entry(ws, struct workspace, list);
-	int ret = 0, ret2;
-	char *data_in;
-	unsigned long page_in_index = 0;
-	size_t srclen = cb->compressed_len;
-	unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
-	unsigned long buf_start;
-	unsigned long buf_offset = 0;
-	unsigned long bytes;
-	unsigned long working_bytes;
-	size_t in_len;
-	size_t out_len;
-	const size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE);
-	unsigned long in_offset;
-	unsigned long in_page_bytes_left;
-	unsigned long tot_in;
-	unsigned long tot_out;
-	unsigned long tot_len;
-	char *buf;
-	struct page **pages_in = cb->compressed_pages;
+	const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+	const u32 sectorsize = fs_info->sectorsize;
+	int ret;
+	/* Compressed data length, can be unaligned */
+	u32 len_in;
+	/* Offset inside the compressed data */
+	u32 cur_in = 0;
+	/* Bytes decompressed so far */
+	u32 cur_out = 0;
+
+	len_in = read_compress_length(page_address(cb->compressed_pages[0]));
+	cur_in += LZO_LEN;
 
-	data_in = page_address(pages_in[0]);
-	tot_len = read_compress_length(data_in);
 	/*
-	 * Compressed data header check.
+	 * LZO header length check
 	 *
-	 * The real compressed size can't exceed the maximum extent length, and
-	 * all pages should be used (whole unused page with just the segment
-	 * header is not possible).  If this happens it means the compressed
-	 * extent is corrupted.
+	 * The total length should not exceed the maximum extent length,
+	 * and all sectors should be used.
+	 * If this happens, it means the compressed extent is corrupted.
 	 */
-	if (tot_len > min_t(size_t, BTRFS_MAX_COMPRESSED, srclen) ||
-	    tot_len < srclen - PAGE_SIZE) {
-		ret = -EUCLEAN;
-		goto done;
+	if (len_in > min_t(size_t, BTRFS_MAX_COMPRESSED, cb->compressed_len) ||
+	    round_up(len_in, sectorsize) < cb->compressed_len) {
+		btrfs_err(fs_info,
+			"invalid lzo header, lzo len %u compressed len %u",
+			len_in, cb->compressed_len);
+		return -EUCLEAN;
 	}
 
-	tot_in = LZO_LEN;
-	in_offset = LZO_LEN;
-	in_page_bytes_left = PAGE_SIZE - LZO_LEN;
-
-	tot_out = 0;
-
-	while (tot_in < tot_len) {
-		in_len = read_compress_length(data_in + in_offset);
-		in_page_bytes_left -= LZO_LEN;
-		in_offset += LZO_LEN;
-		tot_in += LZO_LEN;
+	/* Go through each lzo segment */
+	while (cur_in < len_in) {
+		struct page *cur_page;
+		/* Length of the compressed segment */
+		u32 seg_len;
+		u32 sector_bytes_left;
+		size_t out_len = lzo1x_worst_compress(sectorsize);
 
 		/*
-		 * Segment header check.
-		 *
-		 * The segment length must not exceed the maximum LZO
-		 * compression size, nor the total compressed size.
+		 * We should always have enough space for one segment header
+		 * inside current sector.
 		 */
-		if (in_len > max_segment_len || tot_in + in_len > tot_len) {
-			ret = -EUCLEAN;
-			goto done;
-		}
-
-		tot_in += in_len;
-		working_bytes = in_len;
-
-		/* fast path: avoid using the working buffer */
-		if (in_page_bytes_left >= in_len) {
-			buf = data_in + in_offset;
-			bytes = in_len;
-			goto cont;
-		}
-
-		/* copy bytes from the pages into the working buffer */
-		buf = workspace->cbuf;
-		buf_offset = 0;
-		while (working_bytes) {
-			bytes = min(working_bytes, in_page_bytes_left);
-
-			memcpy(buf + buf_offset, data_in + in_offset, bytes);
-			buf_offset += bytes;
-cont:
-			working_bytes -= bytes;
-			in_page_bytes_left -= bytes;
-			in_offset += bytes;
-
-			/* check if we need to pick another page */
-			if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN)
-			    || in_page_bytes_left == 0) {
-				tot_in += in_page_bytes_left;
-
-				if (working_bytes == 0 && tot_in >= tot_len)
-					break;
-
-				if (page_in_index + 1 >= total_pages_in) {
-					ret = -EIO;
-					goto done;
-				}
-
-				page_in_index++;
-				data_in = page_address(pages_in[page_in_index]);
-
-				in_page_bytes_left = PAGE_SIZE;
-				in_offset = 0;
-			}
-		}
-
-		out_len = max_segment_len;
-		ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
-					    &out_len);
+		ASSERT(cur_in / sectorsize ==
+		       (cur_in + LZO_LEN - 1) / sectorsize);
+		cur_page = cb->compressed_pages[cur_in / PAGE_SIZE];
+		ASSERT(cur_page);
+		seg_len = read_compress_length(page_address(cur_page) +
+					       offset_in_page(cur_in));
+		cur_in += LZO_LEN;
+
+		/* Copy the compressed segment payload into workspace */
+		copy_compressed_segment(cb, workspace->cbuf, seg_len, &cur_in);
+
+		/* Decompress the data */
+		ret = lzo1x_decompress_safe(workspace->cbuf, seg_len,
+					    workspace->buf, &out_len);
 		if (ret != LZO_E_OK) {
-			pr_warn("BTRFS: decompress failed\n");
+			btrfs_err(fs_info, "failed to decompress");
 			ret = -EIO;
-			break;
+			goto out;
 		}
 
-		buf_start = tot_out;
-		tot_out += out_len;
+		/* Copy the data into inode pages */
+		ret = btrfs_decompress_buf2page(workspace->buf, out_len, cb, cur_out);
+		cur_out += out_len;
 
-		ret2 = btrfs_decompress_buf2page(workspace->buf, out_len,
-						 cb, buf_start);
-		if (ret2 == 0)
-			break;
+		/* All data read, exit */
+		if (ret == 0)
+			goto out;
+		ret = 0;
+
+		/* Check if the sector has enough space for a segment header */
+		sector_bytes_left = sectorsize - (cur_in % sectorsize);
+		if (sector_bytes_left >= LZO_LEN)
+			continue;
+
+		/* Skip the padding zeros */
+		cur_in += sector_bytes_left;
 	}
-done:
+out:
 	if (!ret)
 		zero_fill_bio(cb->orig_bio);
 	return ret;
-- 
cgit v1.2.3


From f47960f49e59b9d77bd2919c3513dbbe088c3908 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 26 Jul 2021 14:34:56 +0800
Subject: btrfs: reloc: factor out relocation page read and dirty part

In function relocate_file_extent_cluster(), we have a big loop for
marking all involved page delalloc.

That part is long enough to be contained in one function, so this patch
will move that code chunk into a new function, relocate_one_page().

This also provides enough space for later subpage work.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/relocation.c | 198 ++++++++++++++++++++++++--------------------------
 1 file changed, 93 insertions(+), 105 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index fc831597cb22..e28c3b936937 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2886,19 +2886,102 @@ noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
 }
 ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
 
-static int relocate_file_extent_cluster(struct inode *inode,
-					struct file_extent_cluster *cluster)
+static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
+			     struct file_extent_cluster *cluster,
+			     int *cluster_nr, unsigned long page_index)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	u64 offset = BTRFS_I(inode)->index_cnt;
+	const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT;
+	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+	struct page *page;
 	u64 page_start;
 	u64 page_end;
+	int ret;
+
+	ASSERT(page_index <= last_index);
+	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), PAGE_SIZE);
+	if (ret)
+		return ret;
+
+	page = find_lock_page(inode->i_mapping, page_index);
+	if (!page) {
+		page_cache_sync_readahead(inode->i_mapping, ra, NULL,
+				page_index, last_index + 1 - page_index);
+		page = find_or_create_page(inode->i_mapping, page_index, mask);
+		if (!page) {
+			ret = -ENOMEM;
+			goto release_delalloc;
+		}
+	}
+	ret = set_page_extent_mapped(page);
+	if (ret < 0)
+		goto release_page;
+
+	if (PageReadahead(page))
+		page_cache_async_readahead(inode->i_mapping, ra, NULL, page,
+				   page_index, last_index + 1 - page_index);
+
+	if (!PageUptodate(page)) {
+		btrfs_readpage(NULL, page);
+		lock_page(page);
+		if (!PageUptodate(page)) {
+			ret = -EIO;
+			goto release_page;
+		}
+	}
+
+	page_start = page_offset(page);
+	page_end = page_start + PAGE_SIZE - 1;
+
+	lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
+
+	if (*cluster_nr < cluster->nr &&
+	    page_start + offset == cluster->boundary[*cluster_nr]) {
+		set_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
+				EXTENT_BOUNDARY);
+		(*cluster_nr)++;
+	}
+
+	ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, page_end,
+					0, NULL);
+	if (ret) {
+		clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start,
+				  page_end, EXTENT_LOCKED | EXTENT_BOUNDARY);
+		goto release_page;
+
+	}
+	set_page_dirty(page);
+
+	unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
+	unlock_page(page);
+	put_page(page);
+
+	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+	balance_dirty_pages_ratelimited(inode->i_mapping);
+	btrfs_throttle(fs_info);
+	if (btrfs_should_cancel_balance(fs_info))
+		ret = -ECANCELED;
+	return ret;
+
+release_page:
+	unlock_page(page);
+	put_page(page);
+release_delalloc:
+	btrfs_delalloc_release_metadata(BTRFS_I(inode), PAGE_SIZE, true);
+	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+	return ret;
+}
+
+static int relocate_file_extent_cluster(struct inode *inode,
+					struct file_extent_cluster *cluster)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	u64 offset = BTRFS_I(inode)->index_cnt;
 	unsigned long index;
 	unsigned long last_index;
-	struct page *page;
 	struct file_ra_state *ra;
-	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
-	int nr = 0;
+	int cluster_nr = 0;
 	int ret = 0;
 
 	if (!cluster->nr)
@@ -2919,109 +3002,14 @@ static int relocate_file_extent_cluster(struct inode *inode,
 	if (ret)
 		goto out;
 
-	index = (cluster->start - offset) >> PAGE_SHIFT;
 	last_index = (cluster->end - offset) >> PAGE_SHIFT;
-	while (index <= last_index) {
-		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
-				PAGE_SIZE);
-		if (ret)
-			goto out;
-
-		page = find_lock_page(inode->i_mapping, index);
-		if (!page) {
-			page_cache_sync_readahead(inode->i_mapping,
-						  ra, NULL, index,
-						  last_index + 1 - index);
-			page = find_or_create_page(inode->i_mapping, index,
-						   mask);
-			if (!page) {
-				btrfs_delalloc_release_metadata(BTRFS_I(inode),
-							PAGE_SIZE, true);
-				btrfs_delalloc_release_extents(BTRFS_I(inode),
-							PAGE_SIZE);
-				ret = -ENOMEM;
-				goto out;
-			}
-		}
-		ret = set_page_extent_mapped(page);
-		if (ret < 0) {
-			btrfs_delalloc_release_metadata(BTRFS_I(inode),
-							PAGE_SIZE, true);
-			btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
-			unlock_page(page);
-			put_page(page);
-			goto out;
-		}
-
-		if (PageReadahead(page)) {
-			page_cache_async_readahead(inode->i_mapping,
-						   ra, NULL, page, index,
-						   last_index + 1 - index);
-		}
-
-		if (!PageUptodate(page)) {
-			btrfs_readpage(NULL, page);
-			lock_page(page);
-			if (!PageUptodate(page)) {
-				unlock_page(page);
-				put_page(page);
-				btrfs_delalloc_release_metadata(BTRFS_I(inode),
-							PAGE_SIZE, true);
-				btrfs_delalloc_release_extents(BTRFS_I(inode),
-							       PAGE_SIZE);
-				ret = -EIO;
-				goto out;
-			}
-		}
-
-		page_start = page_offset(page);
-		page_end = page_start + PAGE_SIZE - 1;
-
-		lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
-
-		if (nr < cluster->nr &&
-		    page_start + offset == cluster->boundary[nr]) {
-			set_extent_bits(&BTRFS_I(inode)->io_tree,
-					page_start, page_end,
-					EXTENT_BOUNDARY);
-			nr++;
-		}
-
-		ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start,
-						page_end, 0, NULL);
-		if (ret) {
-			unlock_page(page);
-			put_page(page);
-			btrfs_delalloc_release_metadata(BTRFS_I(inode),
-							 PAGE_SIZE, true);
-			btrfs_delalloc_release_extents(BTRFS_I(inode),
-			                               PAGE_SIZE);
-
-			clear_extent_bits(&BTRFS_I(inode)->io_tree,
-					  page_start, page_end,
-					  EXTENT_LOCKED | EXTENT_BOUNDARY);
-			goto out;
-
-		}
-		set_page_dirty(page);
-
-		unlock_extent(&BTRFS_I(inode)->io_tree,
-			      page_start, page_end);
-		unlock_page(page);
-		put_page(page);
-
-		index++;
-		btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
-		balance_dirty_pages_ratelimited(inode->i_mapping);
-		btrfs_throttle(fs_info);
-		if (btrfs_should_cancel_balance(fs_info)) {
-			ret = -ECANCELED;
-			goto out;
-		}
-	}
-	WARN_ON(nr != cluster->nr);
+	for (index = (cluster->start - offset) >> PAGE_SHIFT;
+	     index <= last_index && !ret; index++)
+		ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index);
 	if (btrfs_is_zoned(fs_info) && !ret)
 		ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+	if (ret == 0)
+		WARN_ON(cluster_nr != cluster->nr);
 out:
 	kfree(ra);
 	return ret;
-- 
cgit v1.2.3


From c2832898126fc320a0e2915b07bf8924cf54770e Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 26 Jul 2021 14:34:57 +0800
Subject: btrfs: make relocate_one_page() handle subpage case

For subpage case, one page of data reloc inode can contain several file
extents, like this:

|<--- File extent A --->| FE B | FE C |<--- File extent D -->|
		|<--------- Page --------->|

We can no longer use PAGE_SIZE directly for various operations.

This patch will relocate_one_page() to handle subpage case by:
- Iterating through all extents of a cluster when marking pages
  When marking pages dirty and delalloc, we need to check the cluster
  extent boundary.
  Now we introduce a loop to go extent by extent of a page, until we
  either finished the last extent, or reach the page end.

  By this, regular sectorsize == PAGE_SIZE can still work as usual, since
  we will do that loop only once.

- Iteration start from max(page_start, extent_start)
  Since we can have the following case:
			| FE B | FE C |<--- File extent D -->|
		|<--------- Page --------->|
  Thus we can't always start from page_start, but do a
  max(page_start, extent_start)

- Iteration end when the cluster is exhausted
  Similar to previous case, the last file extent can end before the page
  end:
|<--- File extent A --->| FE B | FE C |
		|<--------- Page --------->|
  In this case, we need to manually exit the loop after we have finished
  the last extent of the cluster.

- Reserve metadata space for each extent range
  Since now we can hit multiple ranges in one page, we should reserve
  metadata for each range, not simply PAGE_SIZE.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/relocation.c | 106 ++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 77 insertions(+), 29 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index e28c3b936937..9c8cea5cabe4 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -24,6 +24,7 @@
 #include "block-group.h"
 #include "backref.h"
 #include "misc.h"
+#include "subpage.h"
 
 /*
  * Relocation overview
@@ -2886,6 +2887,17 @@ noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
 }
 ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
 
+static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster,
+				    int cluster_nr)
+{
+	/* Last extent, use cluster end directly */
+	if (cluster_nr >= cluster->nr - 1)
+		return cluster->end;
+
+	/* Use next boundary start*/
+	return cluster->boundary[cluster_nr + 1] - 1;
+}
+
 static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
 			     struct file_extent_cluster *cluster,
 			     int *cluster_nr, unsigned long page_index)
@@ -2897,22 +2909,17 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
 	struct page *page;
 	u64 page_start;
 	u64 page_end;
+	u64 cur;
 	int ret;
 
 	ASSERT(page_index <= last_index);
-	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), PAGE_SIZE);
-	if (ret)
-		return ret;
-
 	page = find_lock_page(inode->i_mapping, page_index);
 	if (!page) {
 		page_cache_sync_readahead(inode->i_mapping, ra, NULL,
 				page_index, last_index + 1 - page_index);
 		page = find_or_create_page(inode->i_mapping, page_index, mask);
-		if (!page) {
-			ret = -ENOMEM;
-			goto release_delalloc;
-		}
+		if (!page)
+			return -ENOMEM;
 	}
 	ret = set_page_extent_mapped(page);
 	if (ret < 0)
@@ -2934,30 +2941,74 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
 	page_start = page_offset(page);
 	page_end = page_start + PAGE_SIZE - 1;
 
-	lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
-
-	if (*cluster_nr < cluster->nr &&
-	    page_start + offset == cluster->boundary[*cluster_nr]) {
-		set_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
-				EXTENT_BOUNDARY);
-		(*cluster_nr)++;
-	}
+	/*
+	 * Start from the cluster, as for subpage case, the cluster can start
+	 * inside the page.
+	 */
+	cur = max(page_start, cluster->boundary[*cluster_nr] - offset);
+	while (cur <= page_end) {
+		u64 extent_start = cluster->boundary[*cluster_nr] - offset;
+		u64 extent_end = get_cluster_boundary_end(cluster,
+						*cluster_nr) - offset;
+		u64 clamped_start = max(page_start, extent_start);
+		u64 clamped_end = min(page_end, extent_end);
+		u32 clamped_len = clamped_end + 1 - clamped_start;
+
+		/* Reserve metadata for this range */
+		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
+						      clamped_len);
+		if (ret)
+			goto release_page;
 
-	ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, page_end,
-					0, NULL);
-	if (ret) {
-		clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start,
-				  page_end, EXTENT_LOCKED | EXTENT_BOUNDARY);
-		goto release_page;
+		/* Mark the range delalloc and dirty for later writeback */
+		lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end);
+		ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start,
+						clamped_end, 0, NULL);
+		if (ret) {
+			clear_extent_bits(&BTRFS_I(inode)->io_tree,
+					clamped_start, clamped_end,
+					EXTENT_LOCKED | EXTENT_BOUNDARY);
+			btrfs_delalloc_release_metadata(BTRFS_I(inode),
+							clamped_len, true);
+			btrfs_delalloc_release_extents(BTRFS_I(inode),
+						       clamped_len);
+			goto release_page;
+		}
+		btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len);
 
+		/*
+		 * Set the boundary if it's inside the page.
+		 * Data relocation requires the destination extents to have the
+		 * same size as the source.
+		 * EXTENT_BOUNDARY bit prevents current extent from being merged
+		 * with previous extent.
+		 */
+		if (in_range(cluster->boundary[*cluster_nr] - offset,
+			     page_start, PAGE_SIZE)) {
+			u64 boundary_start = cluster->boundary[*cluster_nr] -
+						offset;
+			u64 boundary_end = boundary_start +
+					   fs_info->sectorsize - 1;
+
+			set_extent_bits(&BTRFS_I(inode)->io_tree,
+					boundary_start, boundary_end,
+					EXTENT_BOUNDARY);
+		}
+		unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end);
+		btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len);
+		cur += clamped_len;
+
+		/* Crossed extent end, go to next extent */
+		if (cur >= extent_end) {
+			(*cluster_nr)++;
+			/* Just finished the last extent of the cluster, exit. */
+			if (*cluster_nr >= cluster->nr)
+				break;
+		}
 	}
-	set_page_dirty(page);
-
-	unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
 	unlock_page(page);
 	put_page(page);
 
-	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
 	balance_dirty_pages_ratelimited(inode->i_mapping);
 	btrfs_throttle(fs_info);
 	if (btrfs_should_cancel_balance(fs_info))
@@ -2967,9 +3018,6 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
 release_page:
 	unlock_page(page);
 	put_page(page);
-release_delalloc:
-	btrfs_delalloc_release_metadata(BTRFS_I(inode), PAGE_SIZE, true);
-	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
 	return ret;
 }
 
-- 
cgit v1.2.3


From cc1d0d93d55ac12e7faee9acfcd7c28c8b86cf89 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 26 Jul 2021 14:34:58 +0800
Subject: btrfs: subpage: fix writeback which does not have ordered extent

[BUG]
When running fsstress with subpage RW support, there are random
BUG_ON()s triggered with the following trace:

 kernel BUG at fs/btrfs/file-item.c:667!
 Internal error: Oops - BUG: 0 [#1] SMP
 CPU: 1 PID: 3486 Comm: kworker/u13:2 5.11.0-rc4-custom+ #43
 Hardware name: Radxa ROCK Pi 4B (DT)
 Workqueue: btrfs-worker-high btrfs_work_helper [btrfs]
 pstate: 60000005 (nZCv daif -PAN -UAO -TCO BTYPE=--)
 pc : btrfs_csum_one_bio+0x420/0x4e0 [btrfs]
 lr : btrfs_csum_one_bio+0x400/0x4e0 [btrfs]
 Call trace:
  btrfs_csum_one_bio+0x420/0x4e0 [btrfs]
  btrfs_submit_bio_start+0x20/0x30 [btrfs]
  run_one_async_start+0x28/0x44 [btrfs]
  btrfs_work_helper+0x128/0x1b4 [btrfs]
  process_one_work+0x22c/0x430
  worker_thread+0x70/0x3a0
  kthread+0x13c/0x140
  ret_from_fork+0x10/0x30

[CAUSE]
Above BUG_ON() means there is some bio range which doesn't have ordered
extent, which indeed is worth a BUG_ON().

Unlike regular sectorsize == PAGE_SIZE case, in subpage we have extra
subpage dirty bitmap to record which range is dirty and should be
written back.

This means, if we submit bio for a subpage range, we do not only need to
clear page dirty, but also need to clear subpage dirty bits.

In __extent_writepage_io(), we will call btrfs_page_clear_dirty() for
any range we submit a bio.

But there is loophole, if we hit a range which is beyond i_size, we just
call btrfs_writepage_endio_finish_ordered() to finish the ordered io,
then break out, without clearing the subpage dirty.

This means, if we hit above branch, the subpage dirty bits are still
there, if other range of the page get dirtied and we need to writeback
that page again, we will submit bio for the old range, leaving a wild
bio range which doesn't have ordered extent.

[FIX]
Fix it by always calling btrfs_page_clear_dirty() in
__extent_writepage_io().

Also to avoid such problem from happening again, add a new assert,
btrfs_page_assert_not_dirty(), to make sure both page dirty and subpage
dirty bits are cleared before exiting __extent_writepage_io().

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 16 ++++++++++++++++
 fs/btrfs/subpage.c   | 20 ++++++++++++++++++++
 fs/btrfs/subpage.h   |  3 +++
 3 files changed, 39 insertions(+)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4ee10669ed6b..e1a464f6b42f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3864,6 +3864,15 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 		if (cur >= i_size) {
 			btrfs_writepage_endio_finish_ordered(inode, page, cur,
 							     end, true);
+			/*
+			 * This range is beyond i_size, thus we don't need to
+			 * bother writing back.
+			 * But we still need to clear the dirty subpage bit, or
+			 * the next time the page gets dirtied, we will try to
+			 * writeback the sectors with subpage dirty bits,
+			 * causing writeback without ordered extent.
+			 */
+			btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur);
 			break;
 		}
 
@@ -3914,6 +3923,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 			else
 				btrfs_writepage_endio_finish_ordered(inode,
 						page, cur, cur + iosize - 1, true);
+			btrfs_page_clear_dirty(fs_info, page, cur, iosize);
 			cur += iosize;
 			continue;
 		}
@@ -3949,6 +3959,12 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 		cur += iosize;
 		nr++;
 	}
+	/*
+	 * If we finish without problem, we should not only clear page dirty,
+	 * but also empty subpage dirty bits
+	 */
+	if (!ret)
+		btrfs_page_assert_not_dirty(fs_info, page);
 	*nr_ret = nr;
 	return ret;
 }
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 640bcd21bf28..3c311841cdee 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -559,3 +559,23 @@ IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
 			 PageWriteback);
 IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
 			 PageOrdered);
+
+/*
+ * Make sure not only the page dirty bit is cleared, but also subpage dirty bit
+ * is cleared.
+ */
+void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
+				 struct page *page)
+{
+	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+
+	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
+		return;
+
+	ASSERT(!PageDirty(page));
+	if (fs_info->sectorsize == PAGE_SIZE)
+		return;
+
+	ASSERT(PagePrivate(page) && page->private);
+	ASSERT(subpage->dirty_bitmap == 0);
+}
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 4d7aca85d915..0120948f37a1 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -126,4 +126,7 @@ DECLARE_BTRFS_SUBPAGE_OPS(ordered);
 bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
 		struct page *page, u64 start, u32 len);
 
+void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
+				 struct page *page);
+
 #endif
-- 
cgit v1.2.3


From 7367253a351ef7202d215d3145d7e83e1472be7d Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 26 Jul 2021 14:34:59 +0800
Subject: btrfs: subpage: disable inline extent creation

[BUG]
When running the following fsx command (extracted from generic/127) on
subpage filesystem, it can create inline extent with regular extents:

  fsx -q -l 262144 -o 65536 -S 191110531 -N 9057 -R -W $mnt/file > /tmp/fsx

The offending extent would look like:

  item 9 key (257 INODE_REF 256) itemoff 15703 itemsize 14
    index 2 namelen 4 name: file
  item 10 key (257 EXTENT_DATA 0) itemoff 14975 itemsize 728
    generation 7 type 0 (inline)
    inline extent data size 707 ram_bytes 707 compression 0 (none)
  item 11 key (257 EXTENT_DATA 4096) itemoff 14922 itemsize 53
    generation 7 type 2 (prealloc)
    prealloc data disk byte 102346752 nr 4096
    prealloc data offset 0 nr 4096

[CAUSE]
For subpage filesystem, the writeback is triggered in page units, which
means, even if we just want to writeback range [16K, 20K) for 64K page
system, we will still try to writeback any dirty sector of range [0, 64K).

This is never a problem if sectorsize == PAGE_SIZE, but for subpage,
this can cause unexpected problems.

For above test case, the last several operations from fsx are:

 9055 trunc      from 0x40000 to 0x2c3
 9057 falloc     from 0x164c to 0x19d2 (0x386 bytes)

In operation 9055, we dirtied sector [0, 4096), then in falloc, we call
btrfs_wait_ordered_range(inode, start=4096, len=4096), only expecting to
writeback any dirty data in [4096, 8192), but nothing else.

Unfortunately, in subpage case, above btrfs_wait_ordered_range() will
trigger writeback of the range [0, 64K), which includes the data at
[0, 4096).

And since at the call site, we haven't yet increased i_size, which is
still 707, this means cow_file_range() can insert an inline extent.

Resulting above inline + regular extent.

[WORKAROUND]
I don't really have any good short-term solution yet, as this means all
operations that would trigger writeback need to be reviewed for any
i_size change.

So here I choose to disable inline extent creation for subpage case as a
workaround.  We have done tons of work just to avoid such extent, so I
don't to create an exception just for subpage.

This only affects inline extent creation, subpage has no problem reading
existing inline extents at all.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ae9e4ad1949a..cade9a2d561e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -681,7 +681,11 @@ again:
 		}
 	}
 cont:
-	if (start == 0) {
+	/*
+	 * Check cow_file_range() for why we don't even try to create inline
+	 * extent for subpage case.
+	 */
+	if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
 		/* lets try to make an inline extent */
 		if (ret || total_in < actual_end) {
 			/* we didn't compress the entire range, try
@@ -1079,7 +1083,17 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
 
 	inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
 
-	if (start == 0) {
+	/*
+	 * Due to the page size limit, for subpage we can only trigger the
+	 * writeback for the dirty sectors of page, that means data writeback
+	 * is doing more writeback than what we want.
+	 *
+	 * This is especially unexpected for some call sites like fallocate,
+	 * where we only increase i_size after everything is done.
+	 * This means we can trigger inline extent even if we didn't want to.
+	 * So here we skip inline extent creation completely.
+	 */
+	if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
 		/* lets try to make an inline extent */
 		ret = cow_file_range_inline(inode, start, end, 0,
 					    BTRFS_COMPRESS_NONE, NULL);
-- 
cgit v1.2.3


From e0eefe07f89516f57dac3fbb759b75a5f0a19ae4 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 26 Jul 2021 14:35:00 +0800
Subject: btrfs: subpage: allow submit_extent_page() to do bio split

Current submit_extent_page() just checks if the current page range can
be fitted into current bio, and if not, submit then re-add.

But this behavior can't handle subpage case at all.

For subpage case, the problem is in the page size, 64K, which is also
the same size as stripe size.

This means, if we can't fit a full 64K into a bio, due to stripe limit,
then it won't fit into next bio without crossing stripe either.

The proper way to handle it is:

- Check how many bytes we can be put into current bio
- Put as many bytes as possible into current bio first
- Submit current bio
- Create a new bio
- Add the remaining bytes into the new bio

Refactor submit_extent_page() so that it does the above iteration.

The main loop inside submit_extent_page() will look like this:

	cur = pg_offset;
	while (cur < pg_offset + size) {
		u32 offset = cur - pg_offset;
		int added;
		if (!bio_ctrl->bio) {
			/* Allocate new bio if needed */
		}

		/* Add as many bytes into the bio */
		added = btrfs_bio_add_page();

		if (added < size - offset) {
			/* The current bio is full, submit it */
		}
		cur += added;
	}

Also, since we're doing new bio allocation deep inside the main loop,
extract that code into a new helper, alloc_new_bio().

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 190 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 131 insertions(+), 59 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e1a464f6b42f..5843f4948431 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -172,6 +172,8 @@ int __must_check submit_one_bio(struct bio *bio, int mirror_num,
 
 	bio->bi_private = NULL;
 
+	/* Caller should ensure the bio has at least some range added */
+	ASSERT(bio->bi_iter.bi_size);
 	if (is_data_inode(tree->private_data))
 		ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
 					    bio_flags);
@@ -3181,20 +3183,22 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
  * @size:	portion of page that we want to write
  * @prev_bio_flags:  flags of previous bio to see if we can merge the current one
  * @bio_flags:	flags of the current bio to see if we can merge them
- * @return:	true if page was added, false otherwise
  *
  * Attempt to add a page to bio considering stripe alignment etc.
  *
- * Return true if successfully page added. Otherwise, return false.
+ * Return >= 0 for the number of bytes added to the bio.
+ * Can return 0 if the current bio is already at stripe/zone boundary.
+ * Return <0 for error.
  */
-static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
-			       struct page *page,
-			       u64 disk_bytenr, unsigned int size,
-			       unsigned int pg_offset,
-			       unsigned long bio_flags)
+static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
+			      struct page *page,
+			      u64 disk_bytenr, unsigned int size,
+			      unsigned int pg_offset,
+			      unsigned long bio_flags)
 {
 	struct bio *bio = bio_ctrl->bio;
 	u32 bio_size = bio->bi_iter.bi_size;
+	u32 real_size;
 	const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
 	bool contig;
 	int ret;
@@ -3203,25 +3207,32 @@ static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
 	/* The limit should be calculated when bio_ctrl->bio is allocated */
 	ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary);
 	if (bio_ctrl->bio_flags != bio_flags)
-		return false;
+		return 0;
 
 	if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED)
 		contig = bio->bi_iter.bi_sector == sector;
 	else
 		contig = bio_end_sector(bio) == sector;
 	if (!contig)
-		return false;
+		return 0;
 
-	if (bio_size + size > bio_ctrl->len_to_oe_boundary ||
-	    bio_size + size > bio_ctrl->len_to_stripe_boundary)
-		return false;
+	real_size = min(bio_ctrl->len_to_oe_boundary,
+			bio_ctrl->len_to_stripe_boundary) - bio_size;
+	real_size = min(real_size, size);
+
+	/*
+	 * If real_size is 0, never call bio_add_*_page(), as even size is 0,
+	 * bio will still execute its endio function on the page!
+	 */
+	if (real_size == 0)
+		return 0;
 
 	if (bio_op(bio) == REQ_OP_ZONE_APPEND)
-		ret = bio_add_zone_append_page(bio, page, size, pg_offset);
+		ret = bio_add_zone_append_page(bio, page, real_size, pg_offset);
 	else
-		ret = bio_add_page(bio, page, size, pg_offset);
+		ret = bio_add_page(bio, page, real_size, pg_offset);
 
-	return ret == size;
+	return ret;
 }
 
 static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
@@ -3279,6 +3290,62 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
 	return 0;
 }
 
+static int alloc_new_bio(struct btrfs_inode *inode,
+			 struct btrfs_bio_ctrl *bio_ctrl,
+			 struct writeback_control *wbc,
+			 unsigned int opf,
+			 bio_end_io_t end_io_func,
+			 u64 disk_bytenr, u32 offset,
+			 unsigned long bio_flags)
+{
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct bio *bio;
+	int ret;
+
+	/*
+	 * For compressed page range, its disk_bytenr is always @disk_bytenr
+	 * passed in, no matter if we have added any range into previous bio.
+	 */
+	if (bio_flags & EXTENT_BIO_COMPRESSED)
+		bio = btrfs_bio_alloc(disk_bytenr);
+	else
+		bio = btrfs_bio_alloc(disk_bytenr + offset);
+	bio_ctrl->bio = bio;
+	bio_ctrl->bio_flags = bio_flags;
+	ret = calc_bio_boundaries(bio_ctrl, inode);
+	if (ret < 0)
+		goto error;
+	bio->bi_end_io = end_io_func;
+	bio->bi_private = &inode->io_tree;
+	bio->bi_write_hint = inode->vfs_inode.i_write_hint;
+	bio->bi_opf = opf;
+	if (wbc) {
+		struct block_device *bdev;
+
+		bdev = fs_info->fs_devices->latest_bdev;
+		bio_set_dev(bio, bdev);
+		wbc_init_bio(wbc, bio);
+	}
+	if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) {
+		struct btrfs_device *device;
+
+		device = btrfs_zoned_get_device(fs_info, disk_bytenr,
+						fs_info->sectorsize);
+		if (IS_ERR(device)) {
+			ret = PTR_ERR(device);
+			goto error;
+		}
+
+		btrfs_io_bio(bio)->device = device;
+	}
+	return 0;
+error:
+	bio_ctrl->bio = NULL;
+	bio->bi_status = errno_to_blk_status(ret);
+	bio_endio(bio);
+	return ret;
+}
+
 /*
  * @opf:	bio REQ_OP_* and REQ_* flags as one value
  * @wbc:	optional writeback control for io accounting
@@ -3304,61 +3371,66 @@ static int submit_extent_page(unsigned int opf,
 			      bool force_bio_submit)
 {
 	int ret = 0;
-	struct bio *bio;
-	size_t io_size = min_t(size_t, size, PAGE_SIZE);
 	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
-	struct extent_io_tree *tree = &inode->io_tree;
-	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	unsigned int cur = pg_offset;
 
 	ASSERT(bio_ctrl);
 
 	ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
 	       pg_offset + size <= PAGE_SIZE);
-	if (bio_ctrl->bio) {
-		bio = bio_ctrl->bio;
-		if (force_bio_submit ||
-		    !btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, io_size,
-					pg_offset, bio_flags)) {
-			ret = submit_one_bio(bio, mirror_num, bio_ctrl->bio_flags);
+	if (force_bio_submit && bio_ctrl->bio) {
+		ret = submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->bio_flags);
+		bio_ctrl->bio = NULL;
+		if (ret < 0)
+			return ret;
+	}
+
+	while (cur < pg_offset + size) {
+		u32 offset = cur - pg_offset;
+		int added;
+
+		/* Allocate new bio if needed */
+		if (!bio_ctrl->bio) {
+			ret = alloc_new_bio(inode, bio_ctrl, wbc, opf,
+					    end_io_func, disk_bytenr, offset,
+					    bio_flags);
+			if (ret < 0)
+				return ret;
+		}
+		/*
+		 * We must go through btrfs_bio_add_page() to ensure each
+		 * page range won't cross various boundaries.
+		 */
+		if (bio_flags & EXTENT_BIO_COMPRESSED)
+			added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr,
+					size - offset, pg_offset + offset,
+					bio_flags);
+		else
+			added = btrfs_bio_add_page(bio_ctrl, page,
+					disk_bytenr + offset, size - offset,
+					pg_offset + offset, bio_flags);
+
+		/* Metadata page range should never be split */
+		if (!is_data_inode(&inode->vfs_inode))
+			ASSERT(added == 0 || added == size - offset);
+
+		/* At least we added some page, update the account */
+		if (wbc && added)
+			wbc_account_cgroup_owner(wbc, page, added);
+
+		/* We have reached boundary, submit right now */
+		if (added < size - offset) {
+			/* The bio should contain some page(s) */
+			ASSERT(bio_ctrl->bio->bi_iter.bi_size);
+			ret = submit_one_bio(bio_ctrl->bio, mirror_num,
+					bio_ctrl->bio_flags);
 			bio_ctrl->bio = NULL;
 			if (ret < 0)
 				return ret;
-		} else {
-			if (wbc)
-				wbc_account_cgroup_owner(wbc, page, io_size);
-			return 0;
 		}
+		cur += added;
 	}
-
-	bio = btrfs_bio_alloc(disk_bytenr);
-	bio_add_page(bio, page, io_size, pg_offset);
-	bio->bi_end_io = end_io_func;
-	bio->bi_private = tree;
-	bio->bi_write_hint = page->mapping->host->i_write_hint;
-	bio->bi_opf = opf;
-	if (wbc) {
-		struct block_device *bdev;
-
-		bdev = fs_info->fs_devices->latest_bdev;
-		bio_set_dev(bio, bdev);
-		wbc_init_bio(wbc, bio);
-		wbc_account_cgroup_owner(wbc, page, io_size);
-	}
-	if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) {
-		struct btrfs_device *device;
-
-		device = btrfs_zoned_get_device(fs_info, disk_bytenr, io_size);
-		if (IS_ERR(device))
-			return PTR_ERR(device);
-
-		btrfs_io_bio(bio)->device = device;
-	}
-
-	bio_ctrl->bio = bio;
-	bio_ctrl->bio_flags = bio_flags;
-	ret = calc_bio_boundaries(bio_ctrl, inode);
-
-	return ret;
+	return 0;
 }
 
 static int attach_extent_buffer_page(struct extent_buffer *eb,
-- 
cgit v1.2.3


From c8050b3b7f76586945003a8a2aeb2c8157f26645 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 26 Jul 2021 14:35:01 +0800
Subject: btrfs: subpage: reject raid56 filesystem and profile conversion

RAID56 is not only unsafe due to its write-hole problem, but also has
tons of hardcoded PAGE_SIZE.

Disable it for subpage support for now.

Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 10 ++++++++++
 fs/btrfs/volumes.c |  7 +++++++
 2 files changed, 17 insertions(+)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a59ab7b9aea0..b9ba244de1d1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3402,6 +3402,16 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 			goto fail_alloc;
 		}
 	}
+	if (sectorsize != PAGE_SIZE) {
+		if (btrfs_super_incompat_flags(fs_info->super_copy) &
+			BTRFS_FEATURE_INCOMPAT_RAID56) {
+			btrfs_err(fs_info,
+		"RAID56 is not yet supported for sector size %u with page size %lu",
+				sectorsize, PAGE_SIZE);
+			err = -EINVAL;
+			goto fail_alloc;
+		}
+	}
 
 	ret = btrfs_init_workqueues(fs_info, fs_devices);
 	if (ret) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d42fb61aadc3..8e61307ffad2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3968,6 +3968,13 @@ static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
 	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
 		return true;
 
+	if (fs_info->sectorsize < PAGE_SIZE &&
+		bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+		btrfs_err(fs_info,
+		"RAID56 is not yet supported for sectorsize %u with page size %lu",
+			  fs_info->sectorsize, PAGE_SIZE);
+		return false;
+	}
 	/* Profile is valid and does not have bits outside of the allowed set */
 	if (alloc_profile_is_valid(bargs->target, 1) &&
 	    (bargs->target & ~allowed) == 0)
-- 
cgit v1.2.3


From e0467866198f7f536806f39e5d0d91ae8018de08 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 26 Jul 2021 14:35:02 +0800
Subject: btrfs: subpage: fix race between prepare_pages() and
 btrfs_releasepage()

[BUG]
When running generic/095, there is a high chance to crash with subpage
data RW support:

 assertion failed: PagePrivate(page) && page->private
 ------------[ cut here ]------------
 kernel BUG at fs/btrfs/ctree.h:3403!
 Internal error: Oops - BUG: 0 [#1] SMP
 CPU: 1 PID: 3567 Comm: fio Tainted: 5.12.0-rc7-custom+ #17
 Hardware name: Khadas VIM3 (DT)
 Call trace:
  assertfail.constprop.0+0x28/0x2c [btrfs]
  btrfs_subpage_assert+0x80/0xa0 [btrfs]
  btrfs_subpage_set_uptodate+0x34/0xec [btrfs]
  btrfs_page_clamp_set_uptodate+0x74/0xa4 [btrfs]
  btrfs_dirty_pages+0x160/0x270 [btrfs]
  btrfs_buffered_write+0x444/0x630 [btrfs]
  btrfs_direct_write+0x1cc/0x2d0 [btrfs]
  btrfs_file_write_iter+0xc0/0x160 [btrfs]
  new_sync_write+0xe8/0x180
  vfs_write+0x1b4/0x210
  ksys_pwrite64+0x7c/0xc0
  __arm64_sys_pwrite64+0x24/0x30
  el0_svc_common.constprop.0+0x70/0x140
  do_el0_svc+0x28/0x90
  el0_svc+0x2c/0x54
  el0_sync_handler+0x1a8/0x1ac
  el0_sync+0x170/0x180
 Code: f0000160 913be042 913c4000 955444bc (d4210000)
 ---[ end trace 3fdd39f4cccedd68 ]---

[CAUSE]
Although prepare_pages() calls find_or_create_page(), which returns the
page locked, but in later prepare_uptodate_page() calls, we may call
btrfs_readpage() which will unlock the page before it returns.

This leaves a window where btrfs_releasepage() can sneak in and release
the page, clearing page->private and causing above ASSERT().

[FIX]
In prepare_uptodate_page(), we should not only check page->mapping, but
also PagePrivate() to ensure we are still holding the correct page which
has proper fs context setup.

Reported-by: Ritesh Harjani <riteshh@linux.ibm.com>
Tested-by: Ritesh Harjani <riteshh@linux.ibm.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/file.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ee34497500e1..8c57af3702fa 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1340,7 +1340,18 @@ static int prepare_uptodate_page(struct inode *inode,
 			unlock_page(page);
 			return -EIO;
 		}
-		if (page->mapping != inode->i_mapping) {
+
+		/*
+		 * Since btrfs_readpage() will unlock the page before it
+		 * returns, there is a window where btrfs_releasepage() can
+		 * be called to release the page.
+		 * Here we check both inode mapping and PagePrivate() to
+		 * make sure the page was not released.
+		 *
+		 * The private flag check is essential for subpage as we need
+		 * to store extra bitmap using page->private.
+		 */
+		if (page->mapping != inode->i_mapping || !PagePrivate(page)) {
 			unlock_page(page);
 			return -EAGAIN;
 		}
-- 
cgit v1.2.3


From 7c11d0ae439565b4560b0c0f36bf05171ed1a146 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 26 Jul 2021 14:35:03 +0800
Subject: btrfs: subpage: fix a potential use-after-free in writeback helper

[BUG]
There is a possible use-after-free bug when running generic/095.

 BUG: Unable to handle kernel data access on write at 0x6b6b6b6b6b6b725b
 Faulting instruction address: 0xc000000000283654
 c000000000283078 do_raw_spin_unlock+0x88/0x230
 c0000000012b1e14 _raw_spin_unlock_irqrestore+0x44/0x90
 c000000000a918dc btrfs_subpage_clear_writeback+0xac/0xe0
 c0000000009e0458 end_bio_extent_writepage+0x158/0x270
 c000000000b6fd14 bio_endio+0x254/0x270
 c0000000009fc0f0 btrfs_end_bio+0x1a0/0x200
 c000000000b6fd14 bio_endio+0x254/0x270
 c000000000b781fc blk_update_request+0x46c/0x670
 c000000000b8b394 blk_mq_end_request+0x34/0x1d0
 c000000000d82d1c lo_complete_rq+0x11c/0x140
 c000000000b880a4 blk_complete_reqs+0x84/0xb0
 c0000000012b2ca4 __do_softirq+0x334/0x680
 c0000000001dd878 irq_exit+0x148/0x1d0
 c000000000016f4c do_IRQ+0x20c/0x240
 c000000000009240 hardware_interrupt_common_virt+0x1b0/0x1c0

[CAUSE]
There is very small race window like the following in generic/095.

	Thread 1		|		Thread 2
--------------------------------+------------------------------------
  end_bio_extent_writepage()	| btrfs_releasepage()
  |- spin_lock_irqsave()	| |
  |- end_page_writeback()	| |
  |				| |- if (PageWriteback() ||...)
  |				| |- clear_page_extent_mapped()
  |				|    |- kfree(subpage);
  |- spin_unlock_irqrestore().

The race can also happen between writeback and btrfs_invalidatepage(),
although that would be much harder as btrfs_invalidatepage() has much
more work to do before the clear_page_extent_mapped() call.

[FIX]
Here we "wait" for the subapge spinlock to be released before we detach
subpage structure.
So this patch will introduce a new function, wait_subpage_spinlock(), to
do the "wait" by acquiring the spinlock and release it.

Since the caller has ensured the page is not dirty nor writeback, and
page is already locked, the only way to hold the subpage spinlock is
from endio function.
Thus we only need to acquire the spinlock to wait for any existing
holder.

Reported-by: Ritesh Harjani <riteshh@linux.ibm.com>
Tested-by: Ritesh Harjani <riteshh@linux.ibm.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/file.c    |  8 ++++----
 fs/btrfs/inode.c   | 39 ++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/subpage.c |  4 +++-
 3 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8c57af3702fa..d3f2623a2af0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1343,10 +1343,10 @@ static int prepare_uptodate_page(struct inode *inode,
 
 		/*
 		 * Since btrfs_readpage() will unlock the page before it
-		 * returns, there is a window where btrfs_releasepage() can
-		 * be called to release the page.
-		 * Here we check both inode mapping and PagePrivate() to
-		 * make sure the page was not released.
+		 * returns, there is a window where btrfs_releasepage() can be
+		 * called to release the page.  Here we check both inode
+		 * mapping and PagePrivate() to make sure the page was not
+		 * released.
 		 *
 		 * The private flag check is essential for subpage as we need
 		 * to store extra bitmap using page->private.
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index cade9a2d561e..034fe81db5c1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8426,11 +8426,47 @@ static void btrfs_readahead(struct readahead_control *rac)
 	extent_readahead(rac);
 }
 
+/*
+ * For releasepage() and invalidatepage() we have a race window where
+ * end_page_writeback() is called but the subpage spinlock is not yet released.
+ * If we continue to release/invalidate the page, we could cause use-after-free
+ * for subpage spinlock.  So this function is to spin and wait for subpage
+ * spinlock.
+ */
+static void wait_subpage_spinlock(struct page *page)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+	struct btrfs_subpage *subpage;
+
+	if (fs_info->sectorsize == PAGE_SIZE)
+		return;
+
+	ASSERT(PagePrivate(page) && page->private);
+	subpage = (struct btrfs_subpage *)page->private;
+
+	/*
+	 * This may look insane as we just acquire the spinlock and release it,
+	 * without doing anything.  But we just want to make sure no one is
+	 * still holding the subpage spinlock.
+	 * And since the page is not dirty nor writeback, and we have page
+	 * locked, the only possible way to hold a spinlock is from the endio
+	 * function to clear page writeback.
+	 *
+	 * Here we just acquire the spinlock so that all existing callers
+	 * should exit and we're safe to release/invalidate the page.
+	 */
+	spin_lock_irq(&subpage->lock);
+	spin_unlock_irq(&subpage->lock);
+}
+
 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 {
 	int ret = try_release_extent_mapping(page, gfp_flags);
-	if (ret == 1)
+
+	if (ret == 1) {
+		wait_subpage_spinlock(page);
 		clear_page_extent_mapped(page);
+	}
 	return ret;
 }
 
@@ -8494,6 +8530,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
 	 * do double ordered extent accounting on the same page.
 	 */
 	wait_on_page_writeback(page);
+	wait_subpage_spinlock(page);
 
 	/*
 	 * For subpage case, we have call sites like
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 3c311841cdee..cb10e56ee31e 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -435,8 +435,10 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
 
 	spin_lock_irqsave(&subpage->lock, flags);
 	subpage->writeback_bitmap &= ~tmp;
-	if (subpage->writeback_bitmap == 0)
+	if (subpage->writeback_bitmap == 0) {
+		ASSERT(PageWriteback(page));
 		end_page_writeback(page);
+	}
 	spin_unlock_irqrestore(&subpage->lock, flags);
 }
 
-- 
cgit v1.2.3


From e3c62324e470c0a89df966603156b34fccd01708 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 26 Jul 2021 14:35:04 +0800
Subject: btrfs: subpage: fix false alert when relocating partial preallocated
 data extents

[BUG]
When relocating partial preallocated data extents (part of the
preallocated extent is written) for subpage, it can cause the following
false alert and make the relocation to fail:

  BTRFS info (device dm-3): balance: start -d
  BTRFS info (device dm-3): relocating block group 13631488 flags data
  BTRFS warning (device dm-3): csum failed root -9 ino 257 off 4096 csum 0x98757625 expected csum 0x00000000 mirror 1
  BTRFS error (device dm-3): bdev /dev/mapper/arm_nvme-test errs: wr 0, rd 0, flush 0, corrupt 1, gen 0
  BTRFS warning (device dm-3): csum failed root -9 ino 257 off 4096 csum 0x98757625 expected csum 0x00000000 mirror 1
  BTRFS error (device dm-3): bdev /dev/mapper/arm_nvme-test errs: wr 0, rd 0, flush 0, corrupt 2, gen 0
  BTRFS info (device dm-3): balance: ended with status: -5

The minimal script to reproduce looks like this:

  mkfs.btrfs -f -s 4k $dev
  mount $dev -o nospace_cache $mnt
  xfs_io -f -c "falloc 0 8k" $mnt/file
  xfs_io -f -c "pwrite 0 4k" $mnt/file
  btrfs balance start -d $mnt

[CAUSE]
Function btrfs_verify_data_csum() checks if the full range has
EXTENT_NODATASUM bit for data reloc inode, if *all* bytes of the range
have EXTENT_NODATASUM bit, then it skip the range.

This works pretty well for regular sectorsize, as in that case
btrfs_verify_data_csum() is called for each sector, thus no problem at
all.

But for subpage case, btrfs_verify_data_csum() is called on each bvec,
which can contain several sectors, and since it checks *all* bytes for
EXTENT_NODATASUM bit, if we have some range with csum, then we will
continue checking all the sectors.

For the preallocated sectors, it doesn't have any csum, thus obviously
the csum won't match and cause the false alert.

[FIX]
Move the EXTENT_NODATASUM check into the main loop, so that we can check
each sector for EXTENT_NODATASUM bit for subpage case.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 034fe81db5c1..915f11462c7c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3290,19 +3290,24 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
 	if (!root->fs_info->csum_root)
 		return 0;
 
-	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
-	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
-		clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
-		return 0;
-	}
-
 	ASSERT(page_offset(page) <= start &&
 	       end <= page_offset(page) + PAGE_SIZE - 1);
 	for (pg_off = offset_in_page(start);
 	     pg_off < offset_in_page(end);
 	     pg_off += sectorsize, bio_offset += sectorsize) {
+		u64 file_offset = pg_off + page_offset(page);
 		int ret;
 
+		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
+		    test_range_bit(io_tree, file_offset,
+				   file_offset + sectorsize - 1,
+				   EXTENT_NODATASUM, 1, NULL)) {
+			/* Skip the range without csum for data reloc inode */
+			clear_extent_bits(io_tree, file_offset,
+					  file_offset + sectorsize - 1,
+					  EXTENT_NODATASUM);
+			continue;
+		}
 		ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
 				      page_offset(page) + pg_off);
 		if (ret < 0) {
-- 
cgit v1.2.3


From 9d9ea1e68a05ef852d612f0c49d274c86e1e710a Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 26 Jul 2021 14:35:05 +0800
Subject: btrfs: subpage: fix relocation potentially overwriting last page data

[BUG]
When using the following script, btrfs will report data corruption after
one data balance with subpage support:

  mkfs.btrfs -f -s 4k $dev
  mount $dev -o nospace_cache $mnt
  $fsstress -w -n 8 -s 1620948986 -d $mnt/ -v > /tmp/fsstress
  sync
  btrfs balance start -d $mnt
  btrfs scrub start -B $mnt

Similar problem can be easily observed in btrfs/028 test case, there
will be tons of balance failure with -EIO.

[CAUSE]
Above fsstress will result the following data extents layout in extent
tree:
  item 10 key (13631488 EXTENT_ITEM 98304) itemoff 15889 itemsize 82
    refs 2 gen 7 flags DATA
    extent data backref root FS_TREE objectid 259 offset 1339392 count 1
    extent data backref root FS_TREE objectid 259 offset 647168 count 1
  item 11 key (13631488 BLOCK_GROUP_ITEM 8388608) itemoff 15865 itemsize 24
    block group used 102400 chunk_objectid 256 flags DATA
  item 12 key (13733888 EXTENT_ITEM 4096) itemoff 15812 itemsize 53
    refs 1 gen 7 flags DATA
    extent data backref root FS_TREE objectid 259 offset 729088 count 1

Then when creating the data reloc inode, the data reloc inode will look
like this:

	0	32K	64K	96K 100K	104K
	|<------ Extent A ----->|   |<- Ext B ->|

Then when we first try to relocate extent A, we setup the data reloc
inode with i_size 96K, then read both page [0, 64K) and page [64K, 128K).

For page 64K, since the i_size is just 96K, we fill range [96K, 128K)
with 0 and set it uptodate.

Then when we come to extent B, we update i_size to 104K, then try to read
page [64K, 128K).
Then we find the page is already uptodate, so we skip the read.
But range [96K, 128K) is filled with 0, not the real data.

Then we writeback the data reloc inode to disk, with 0 filling range
[96K, 128K), corrupting the content of extent B.

The behavior is caused by the fact that we still do full page read for
subpage case.

The bug won't really happen for regular sectorsize, as one page only
contains one sector.

[FIX]
This patch will fix the problem by invalidating range [i_size, PAGE_END]
in prealloc_file_extent_cluster().

So that if above example happens, when we preallocate the file extent
for extent B, we will clear the uptodate bits for range [96K, 128K),
allowing later relocate_one_page() to re-read the needed range.

There is a special note for the invalidating part.

Since we're not calling real btrfs_invalidatepage(), but just clearing
the subpage and page uptodate bits, we can leave a page half dirty and
half out of date.

Reading such page can cause a deadlock, as we normally expect a dirty
page to be fully uptodate.

Thus here we flush and wait the data reloc inode before doing the hacked
invalidating.  This won't cause extra overhead, as we're going to
writeback the data later anyway.

Reported-by: Ritesh Harjani <riteshh@linux.ibm.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/relocation.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 9c8cea5cabe4..914d403b4415 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2782,10 +2782,70 @@ static noinline_for_stack int prealloc_file_extent_cluster(
 	u64 num_bytes;
 	int nr;
 	int ret = 0;
+	u64 i_size = i_size_read(&inode->vfs_inode);
 	u64 prealloc_start = cluster->start - offset;
 	u64 prealloc_end = cluster->end - offset;
 	u64 cur_offset = prealloc_start;
 
+	/*
+	 * For subpage case, previous i_size may not be aligned to PAGE_SIZE.
+	 * This means the range [i_size, PAGE_END + 1) is filled with zeros by
+	 * btrfs_do_readpage() call of previously relocated file cluster.
+	 *
+	 * If the current cluster starts in the above range, btrfs_do_readpage()
+	 * will skip the read, and relocate_one_page() will later writeback
+	 * the padding zeros as new data, causing data corruption.
+	 *
+	 * Here we have to manually invalidate the range (i_size, PAGE_END + 1).
+	 */
+	if (!IS_ALIGNED(i_size, PAGE_SIZE)) {
+		struct address_space *mapping = inode->vfs_inode.i_mapping;
+		struct btrfs_fs_info *fs_info = inode->root->fs_info;
+		const u32 sectorsize = fs_info->sectorsize;
+		struct page *page;
+
+		ASSERT(sectorsize < PAGE_SIZE);
+		ASSERT(IS_ALIGNED(i_size, sectorsize));
+
+		/*
+		 * Subpage can't handle page with DIRTY but without UPTODATE
+		 * bit as it can lead to the following deadlock:
+		 *
+		 * btrfs_readpage()
+		 * | Page already *locked*
+		 * |- btrfs_lock_and_flush_ordered_range()
+		 *    |- btrfs_start_ordered_extent()
+		 *       |- extent_write_cache_pages()
+		 *          |- lock_page()
+		 *             We try to lock the page we already hold.
+		 *
+		 * Here we just writeback the whole data reloc inode, so that
+		 * we will be ensured to have no dirty range in the page, and
+		 * are safe to clear the uptodate bits.
+		 *
+		 * This shouldn't cause too much overhead, as we need to write
+		 * the data back anyway.
+		 */
+		ret = filemap_write_and_wait(mapping);
+		if (ret < 0)
+			return ret;
+
+		clear_extent_bits(&inode->io_tree, i_size,
+				  round_up(i_size, PAGE_SIZE) - 1,
+				  EXTENT_UPTODATE);
+		page = find_lock_page(mapping, i_size >> PAGE_SHIFT);
+		/*
+		 * If page is freed we don't need to do anything then, as we
+		 * will re-read the whole page anyway.
+		 */
+		if (page) {
+			btrfs_subpage_clear_uptodate(fs_info, page, i_size,
+					round_up(i_size, PAGE_SIZE) - i_size);
+			unlock_page(page);
+			put_page(page);
+		}
+	}
+
 	BUG_ON(cluster->start != cluster->boundary[0]);
 	ret = btrfs_alloc_data_chunk_ondemand(inode,
 					      prealloc_end + 1 - prealloc_start);
-- 
cgit v1.2.3


From 95ea0486b20e4de9011d04b05ed667201940b532 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 26 Jul 2021 14:35:06 +0800
Subject: btrfs: allow read-write for 4K sectorsize on 64K page size systems

Since now we support data and metadata read-write for subpage, remove
the RO requirement for subpage mount.

There are some extra limitations though:

- For now, subpage RW mount is still considered experimental
  Thus that mount warning will still be there.

- No compression support
  There are still quite some PAGE_SIZE hard coded and quite some call
  sites use extent_clear_unlock_delalloc() to unlock locked_page.
  This will screw up subpage helpers.

  Now for subpage RW mount, no matter what mount option or inode attr is
  set, all writes will not be compressed.  Although reading compressed
  data has no problem.

- No defrag for subpage case
  The defrag support for subpage case will come in later patches, which
  will also rework the defrag workflow.

- No inline extent will be created
  This is mostly due to the fact that filemap_fdatawrite_range() will
  trigger more write than the range specified.
  In fallocate calls, this behavior can make us to writeback which can
  be inlined, before we enlarge the i_size.

  This is a very special corner case, and even current btrfs check won't
  report error on such inline extent + regular extent.
  But considering how much effort has been put to prevent such inline +
  regular, I'd prefer to cut off inline extent completely until we have
  a good solution.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 13 ++++---------
 fs/btrfs/inode.c   |  3 +++
 fs/btrfs/ioctl.c   |  6 ++++++
 fs/btrfs/super.c   |  7 -------
 fs/btrfs/sysfs.c   |  4 ++++
 5 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b9ba244de1d1..2f9515dccce0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3392,15 +3392,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 		goto fail_alloc;
 	}
 
-	/* For 4K sector size support, it's only read-only */
-	if (PAGE_SIZE == SZ_64K && sectorsize == SZ_4K) {
-		if (!sb_rdonly(sb) || btrfs_super_log_root(disk_super)) {
-			btrfs_err(fs_info,
-	"subpage sectorsize %u only supported read-only for page size %lu",
-				sectorsize, PAGE_SIZE);
-			err = -EINVAL;
-			goto fail_alloc;
-		}
+	if (sectorsize != PAGE_SIZE) {
+		btrfs_warn(fs_info,
+		"read-write for sector size %u with page size %lu is experimental",
+			   sectorsize, PAGE_SIZE);
 	}
 	if (sectorsize != PAGE_SIZE) {
 		if (btrfs_super_incompat_flags(fs_info->super_copy) &
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 915f11462c7c..7e63b5c0a1dc 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -489,6 +489,9 @@ static noinline int add_async_extent(struct async_chunk *cow,
  */
 static inline bool inode_can_compress(struct btrfs_inode *inode)
 {
+	/* Subpage doesn't support compression yet */
+	if (inode->root->fs_info->sectorsize < PAGE_SIZE)
+		return false;
 	if (inode->flags & BTRFS_INODE_NODATACOW ||
 	    inode->flags & BTRFS_INODE_NODATASUM)
 		return false;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0ba98e08a029..4d809899c076 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3115,6 +3115,12 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 		goto out;
 	}
 
+	/* Subpage defrag will be supported in later commits */
+	if (root->fs_info->sectorsize < PAGE_SIZE) {
+		ret = -ENOTTY;
+		goto out;
+	}
+
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFDIR:
 		if (!capable(CAP_SYS_ADMIN)) {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 35ff142ad242..0d2e3ab2fc31 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2041,13 +2041,6 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 			ret = -EINVAL;
 			goto restore;
 		}
-		if (fs_info->sectorsize < PAGE_SIZE) {
-			btrfs_warn(fs_info,
-	"read-write mount is not yet allowed for sectorsize %u page size %lu",
-				   fs_info->sectorsize, PAGE_SIZE);
-			ret = -EINVAL;
-			goto restore;
-		}
 
 		/*
 		 * NOTE: when remounting with a change that does writes, don't
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 9d1d140118ff..d9d53a255ef9 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -366,6 +366,10 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
 {
 	ssize_t ret = 0;
 
+	/* 4K sector size is also supported with 64K page size */
+	if (PAGE_SIZE == SZ_64K)
+		ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%u ", SZ_4K);
+
 	/* Only sectorsize == PAGE_SIZE is now supported */
 	ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%lu\n", PAGE_SIZE);
 
-- 
cgit v1.2.3


From 963e4db83e2832ee5e760f3c7f92d68bd66156f6 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 26 Jul 2021 14:35:07 +0800
Subject: btrfs: unify regular and subpage error paths in __extent_writepage()

[BUG]
When running btrfs/160 in a loop for subpage with experimental
compression support, it has a high chance to crash (~20%):

 BTRFS critical (device dm-7): panic in __btrfs_add_ordered_extent:238: inconsistency in ordered tree at offset 0 (errno=-17 Object already exists)
 ------------[ cut here ]------------
 kernel BUG at fs/btrfs/ordered-data.c:238!
 Internal error: Oops - BUG: 0 [#1] SMP
 pc : __btrfs_add_ordered_extent+0x550/0x670 [btrfs]
 lr : __btrfs_add_ordered_extent+0x550/0x670 [btrfs]
 Call trace:
  __btrfs_add_ordered_extent+0x550/0x670 [btrfs]
  btrfs_add_ordered_extent+0x2c/0x50 [btrfs]
  run_delalloc_nocow+0x81c/0x8fc [btrfs]
  btrfs_run_delalloc_range+0xa4/0x390 [btrfs]
  writepage_delalloc+0xc0/0x1ac [btrfs]
  __extent_writepage+0xf4/0x370 [btrfs]
  extent_write_cache_pages+0x288/0x4f4 [btrfs]
  extent_writepages+0x58/0xe0 [btrfs]
  btrfs_writepages+0x1c/0x30 [btrfs]
  do_writepages+0x60/0x110
  __filemap_fdatawrite_range+0x108/0x170
  filemap_fdatawrite_range+0x20/0x30
  btrfs_fdatawrite_range+0x34/0x4dc [btrfs]
  __btrfs_write_out_cache+0x34c/0x480 [btrfs]
  btrfs_write_out_cache+0x144/0x220 [btrfs]
  btrfs_start_dirty_block_groups+0x3ac/0x6b0 [btrfs]
  btrfs_commit_transaction+0xd0/0xbb4 [btrfs]
  btrfs_sync_fs+0x64/0x1cc [btrfs]
  sync_fs_one_sb+0x3c/0x50
  iterate_supers+0xcc/0x1d4
  ksys_sync+0x6c/0xd0
  __arm64_sys_sync+0x1c/0x30
  invoke_syscall+0x50/0x120
  el0_svc_common.constprop.0+0x4c/0xd4
  do_el0_svc+0x30/0x9c
  el0_svc+0x2c/0x54
  el0_sync_handler+0x1a8/0x1b0
  el0_sync+0x198/0x1c0
 ---[ end trace 336f67369ae6e0af ]---

[CAUSE]
For subpage case, we can have multiple sectors inside a page, this makes
it possible for __extent_writepage() to have part of its page submitted
before returning.

In btrfs/160, we are using dm-dust to emulate write error, this means
for certain pages, we could have everything running fine, but at the end
of __extent_writepage(), one of the submitted bios fails due to dm-dust.

Then the page is marked Error, and we change @ret from 0 to -EIO.

This makes the caller extent_write_cache_pages() to error out, without
submitting the remaining pages.

Furthermore, since we're erroring out for free space cache, it doesn't
really care about the error and will update the inode and retry the
writeback.

Then we re-run the delalloc range, and will try to insert the same
delalloc range while previous delalloc range is still hanging there,
triggering the above error.

[FIX]
The proper fix is to handle errors from __extent_writepage() properly,
by ending the remaining ordered extent.

But that fix needs the following changes:

- Know at exactly which sector the error happened
  Currently __extent_writepage_io() works for the full page, can't
  return at which sector we hit the error.

- Grab the ordered extent covering the failed sector

As a hotfix for subpage case, here we unify the error paths in
__extent_writepage().

In fact, the "if (PageError(page))" branch never get executed if @ret is
still 0 for non-subpage cases.

As for non-subpage case, we never submit current page in
__extent_writepage(), but only add current page into bio.
The bio can only get submitted in next page.

Thus we never get PageError() set due to IO failure, thus when we hit
the branch, @ret is never 0.

By simply removing that @ret assignment, we let subpage case ignore the
IO failure, thus only error out for fatal errors just like regular
sectorsize.

So that IO error won't be treated as fatal error not trigger the hanging
OE problem.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 51 ++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 44 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 5843f4948431..378ae860b3f3 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2789,8 +2789,14 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
 	btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate);
 
 	if (!uptodate) {
-		ClearPageUptodate(page);
-		SetPageError(page);
+		const struct btrfs_fs_info *fs_info = inode->root->fs_info;
+		u32 len;
+
+		ASSERT(end + 1 - start <= U32_MAX);
+		len = end + 1 - start;
+
+		btrfs_page_clear_uptodate(fs_info, page, start, len);
+		btrfs_page_set_error(fs_info, page, start, len);
 		ret = err < 0 ? err : -EIO;
 		mapping_set_error(page->mapping, ret);
 	}
@@ -3793,7 +3799,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
 		ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
 				delalloc_end, &page_started, nr_written, wbc);
 		if (ret) {
-			SetPageError(page);
+			btrfs_page_set_error(inode->root->fs_info, page,
+					     page_offset(page), PAGE_SIZE);
 			/*
 			 * btrfs_run_delalloc_range should return < 0 for error
 			 * but just in case, we use > 0 here meaning the IO is
@@ -4067,7 +4074,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 
 	WARN_ON(!PageLocked(page));
 
-	ClearPageError(page);
+	btrfs_page_clear_error(btrfs_sb(inode->i_sb), page,
+			       page_offset(page), PAGE_SIZE);
 
 	pg_offset = offset_in_page(i_size);
 	if (page->index > end_index ||
@@ -4108,10 +4116,39 @@ done:
 		set_page_writeback(page);
 		end_page_writeback(page);
 	}
-	if (PageError(page)) {
-		ret = ret < 0 ? ret : -EIO;
+	/*
+	 * Here we used to have a check for PageError() and then set @ret and
+	 * call end_extent_writepage().
+	 *
+	 * But in fact setting @ret here will cause different error paths
+	 * between subpage and regular sectorsize.
+	 *
+	 * For regular page size, we never submit current page, but only add
+	 * current page to current bio.
+	 * The bio submission can only happen in next page.
+	 * Thus if we hit the PageError() branch, @ret is already set to
+	 * non-zero value and will not get updated for regular sectorsize.
+	 *
+	 * But for subpage case, it's possible we submit part of current page,
+	 * thus can get PageError() set by submitted bio of the same page,
+	 * while our @ret is still 0.
+	 *
+	 * So here we unify the behavior and don't set @ret.
+	 * Error can still be properly passed to higher layer as page will
+	 * be set error, here we just don't handle the IO failure.
+	 *
+	 * NOTE: This is just a hotfix for subpage.
+	 * The root fix will be properly ending ordered extent when we hit
+	 * an error during writeback.
+	 *
+	 * But that needs a bigger refactoring, as we not only need to grab the
+	 * submitted OE, but also need to know exactly at which bytenr we hit
+	 * the error.
+	 * Currently the full page based __extent_writepage_io() is not
+	 * capable of that.
+	 */
+	if (PageError(page))
 		end_extent_writepage(page, ret, start, page_end);
-	}
 	unlock_page(page);
 	ASSERT(ret <= 0);
 	return ret;
-- 
cgit v1.2.3


From ac98141d140444fe93e26471d3074c603b70e2ca Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Wed, 14 Jul 2021 14:47:17 -0400
Subject: btrfs: wake up async_delalloc_pages waiters after submit

We use the async_delalloc_pages mechanism to make sure that we've
completed our async work before trying to continue our delalloc
flushing.  The reason for this is we need to see any ordered extents
that were created by our delalloc flushing.  However we're waking up
before we do the submit work, which is before we create the ordered
extents.  This is a pretty wide race window where we could potentially
think there are no ordered extents and thus exit shrink_delalloc
prematurely.  Fix this by waking us up after we've done the work to
create ordered extents.

CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7e63b5c0a1dc..73b6413cc9af 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1306,11 +1306,6 @@ static noinline void async_cow_submit(struct btrfs_work *work)
 	nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
 		PAGE_SHIFT;
 
-	/* atomic_sub_return implies a barrier */
-	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
-	    5 * SZ_1M)
-		cond_wake_up_nomb(&fs_info->async_submit_wait);
-
 	/*
 	 * ->inode could be NULL if async_chunk_start has failed to compress,
 	 * in which case we don't have anything to submit, yet we need to
@@ -1319,6 +1314,11 @@ static noinline void async_cow_submit(struct btrfs_work *work)
 	 */
 	if (async_chunk->inode)
 		submit_compressed_extents(async_chunk);
+
+	/* atomic_sub_return implies a barrier */
+	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
+	    5 * SZ_1M)
+		cond_wake_up_nomb(&fs_info->async_submit_wait);
 }
 
 static noinline void async_cow_free(struct btrfs_work *work)
-- 
cgit v1.2.3


From 8197766d806f02d69d2f54563e0b4672bddcc535 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Wed, 14 Jul 2021 14:47:18 -0400
Subject: btrfs: include delalloc related info in dump space info tracepoint

In order to debug delalloc flushing issues I added delalloc_bytes and
ordered_bytes to this tracepoint to see if they were non-zero when we
were going ENOSPC. This was valuable for me and showed me cases where we
weren't waiting on ordered extents properly. In order to add this to the
tracepoint we need to take away the const modifier for fs_info, as
percpu_sum_counter_positive() will change the counter when it adds up
the percpu buckets.  This is needed to make sure we're getting accurate
information at these tracepoints, as the wrong information could send us
down the wrong path when debugging problems.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 include/trace/events/btrfs.h | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index b671b1f2ce0f..97e16a34405b 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -2037,7 +2037,7 @@ TRACE_EVENT(btrfs_convert_extent_bit,
 );
 
 DECLARE_EVENT_CLASS(btrfs_dump_space_info,
-	TP_PROTO(const struct btrfs_fs_info *fs_info,
+	TP_PROTO(struct btrfs_fs_info *fs_info,
 		 const struct btrfs_space_info *sinfo),
 
 	TP_ARGS(fs_info, sinfo),
@@ -2057,6 +2057,8 @@ DECLARE_EVENT_CLASS(btrfs_dump_space_info,
 		__field(	u64,	delayed_refs_reserved	)
 		__field(	u64,	delayed_reserved	)
 		__field(	u64,	free_chunk_space	)
+		__field(	u64,	delalloc_bytes		)
+		__field(	u64,	ordered_bytes		)
 	),
 
 	TP_fast_assign_btrfs(fs_info,
@@ -2074,6 +2076,8 @@ DECLARE_EVENT_CLASS(btrfs_dump_space_info,
 		__entry->delayed_refs_reserved	=	fs_info->delayed_refs_rsv.reserved;
 		__entry->delayed_reserved	=	fs_info->delayed_block_rsv.reserved;
 		__entry->free_chunk_space	=	atomic64_read(&fs_info->free_chunk_space);
+		__entry->delalloc_bytes		=	percpu_counter_sum_positive(&fs_info->delalloc_bytes);
+		__entry->ordered_bytes		=	percpu_counter_sum_positive(&fs_info->ordered_bytes);
 	),
 
 	TP_printk_btrfs("flags=%s total_bytes=%llu bytes_used=%llu "
@@ -2081,7 +2085,8 @@ DECLARE_EVENT_CLASS(btrfs_dump_space_info,
 			"bytes_may_use=%llu bytes_readonly=%llu "
 			"reclaim_size=%llu clamp=%d global_reserved=%llu "
 			"trans_reserved=%llu delayed_refs_reserved=%llu "
-			"delayed_reserved=%llu chunk_free_space=%llu",
+			"delayed_reserved=%llu chunk_free_space=%llu "
+			"delalloc_bytes=%llu ordered_bytes=%llu",
 			__print_flags(__entry->flags, "|", BTRFS_GROUP_FLAGS),
 			__entry->total_bytes, __entry->bytes_used,
 			__entry->bytes_pinned, __entry->bytes_reserved,
@@ -2089,11 +2094,12 @@ DECLARE_EVENT_CLASS(btrfs_dump_space_info,
 			__entry->reclaim_size, __entry->clamp,
 			__entry->global_reserved, __entry->trans_reserved,
 			__entry->delayed_refs_reserved,
-			__entry->delayed_reserved, __entry->free_chunk_space)
+			__entry->delayed_reserved, __entry->free_chunk_space,
+			__entry->delalloc_bytes, __entry->ordered_bytes)
 );
 
 DEFINE_EVENT(btrfs_dump_space_info, btrfs_done_preemptive_reclaim,
-	TP_PROTO(const struct btrfs_fs_info *fs_info,
+	TP_PROTO(struct btrfs_fs_info *fs_info,
 		 const struct btrfs_space_info *sinfo),
 	TP_ARGS(fs_info, sinfo)
 );
-- 
cgit v1.2.3


From fcdef39c03c5beb2a7bcb627addb0b259b9c5164 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Wed, 14 Jul 2021 14:47:19 -0400
Subject: btrfs: enable a tracepoint when we fail tickets

When debugging early enospc problems it was useful to have a tracepoint
where we failed all tickets so I could check the state of the enospc
counters at failure time to validate my fixes.  This adds the tracpoint
so you can easily get that information.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/space-info.c        | 2 ++
 include/trace/events/btrfs.h | 6 ++++++
 2 files changed, 8 insertions(+)

diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index f79bf85f2439..fbd492fe87f9 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -824,6 +824,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
 	struct reserve_ticket *ticket;
 	u64 tickets_id = space_info->tickets_id;
 
+	trace_btrfs_fail_all_tickets(fs_info, space_info);
+
 	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
 		btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
 		__btrfs_dump_space_info(fs_info, space_info);
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 97e16a34405b..c7d19eadecc5 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -2104,6 +2104,12 @@ DEFINE_EVENT(btrfs_dump_space_info, btrfs_done_preemptive_reclaim,
 	TP_ARGS(fs_info, sinfo)
 );
 
+DEFINE_EVENT(btrfs_dump_space_info, btrfs_fail_all_tickets,
+	TP_PROTO(struct btrfs_fs_info *fs_info,
+		 const struct btrfs_space_info *sinfo),
+	TP_ARGS(fs_info, sinfo)
+);
+
 TRACE_EVENT(btrfs_reserve_ticket,
 	TP_PROTO(const struct btrfs_fs_info *fs_info, u64 flags, u64 bytes,
 		 u64 start_ns, int flush, int error),
-- 
cgit v1.2.3


From 03fe78cc2942c55cc13be5ca42578750f17204a1 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Wed, 14 Jul 2021 14:47:20 -0400
Subject: btrfs: use delalloc_bytes to determine flush amount for
 shrink_delalloc

We have been hitting some early ENOSPC issues in production with more
recent kernels, and I tracked it down to us simply not flushing delalloc
as aggressively as we should be.  With tracing I was seeing us failing
all tickets with all of the block rsvs at or around 0, with very little
pinned space, but still around 120MiB of outstanding bytes_may_used.
Upon further investigation I saw that we were flushing around 14 pages
per shrink call for delalloc, despite having around 2GiB of delalloc
outstanding.

Consider the example of a 8 way machine, all CPUs trying to create a
file in parallel, which at the time of this commit requires 5 items to
do.  Assuming a 16k leaf size, we have 10MiB of total metadata reclaim
size waiting on reservations.  Now assume we have 128MiB of delalloc
outstanding.  With our current math we would set items to 20, and then
set to_reclaim to 20 * 256k, or 5MiB.

Assuming that we went through this loop all 3 times, for both
FLUSH_DELALLOC and FLUSH_DELALLOC_WAIT, and then did the full loop
twice, we'd only flush 60MiB of the 128MiB delalloc space.  This could
leave a fair bit of delalloc reservations still hanging around by the
time we go to ENOSPC out all the remaining tickets.

Fix this two ways.  First, change the calculations to be a fraction of
the total delalloc bytes on the system.  Prior to this change we were
calculating based on dirty inodes so our math made more sense, now it's
just completely unrelated to what we're actually doing.

Second add a FLUSH_DELALLOC_FULL state, that we hold off until we've
gone through the flush states at least once.  This will empty the system
of all delalloc so we're sure to be truly out of space when we start
failing tickets.

I'm tagging stable 5.10 and forward, because this is where we started
using the page stuff heavily again.  This affects earlier kernel
versions as well, but would be a pain to backport to them as the
flushing mechanisms aren't the same.

CC: stable@vger.kernel.org # 5.10+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h             |  9 +++++----
 fs/btrfs/space-info.c        | 40 ++++++++++++++++++++++++++++------------
 include/trace/events/btrfs.h |  1 +
 3 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3cccf0f05666..fd3084feb4b5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2779,10 +2779,11 @@ enum btrfs_flush_state {
 	FLUSH_DELAYED_REFS	=	4,
 	FLUSH_DELALLOC		=	5,
 	FLUSH_DELALLOC_WAIT	=	6,
-	ALLOC_CHUNK		=	7,
-	ALLOC_CHUNK_FORCE	=	8,
-	RUN_DELAYED_IPUTS	=	9,
-	COMMIT_TRANS		=	10,
+	FLUSH_DELALLOC_FULL	=	7,
+	ALLOC_CHUNK		=	8,
+	ALLOC_CHUNK_FORCE	=	9,
+	RUN_DELAYED_IPUTS	=	10,
+	COMMIT_TRANS		=	11,
 };
 
 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index fbd492fe87f9..eb90a262563f 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -493,6 +493,11 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
 	long time_left;
 	int loops;
 
+	delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
+	ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
+	if (delalloc_bytes == 0 && ordered_bytes == 0)
+		return;
+
 	/* Calc the number of the pages we need flush for space reservation */
 	if (to_reclaim == U64_MAX) {
 		items = U64_MAX;
@@ -500,22 +505,21 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
 		/*
 		 * to_reclaim is set to however much metadata we need to
 		 * reclaim, but reclaiming that much data doesn't really track
-		 * exactly, so increase the amount to reclaim by 2x in order to
-		 * make sure we're flushing enough delalloc to hopefully reclaim
-		 * some metadata reservations.
+		 * exactly.  What we really want to do is reclaim full inode's
+		 * worth of reservations, however that's not available to us
+		 * here.  We will take a fraction of the delalloc bytes for our
+		 * flushing loops and hope for the best.  Delalloc will expand
+		 * the amount we write to cover an entire dirty extent, which
+		 * will reclaim the metadata reservation for that range.  If
+		 * it's not enough subsequent flush stages will be more
+		 * aggressive.
 		 */
+		to_reclaim = max(to_reclaim, delalloc_bytes >> 3);
 		items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
-		to_reclaim = items * EXTENT_SIZE_PER_ITEM;
 	}
 
 	trans = (struct btrfs_trans_handle *)current->journal_info;
 
-	delalloc_bytes = percpu_counter_sum_positive(
-						&fs_info->delalloc_bytes);
-	ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
-	if (delalloc_bytes == 0 && ordered_bytes == 0)
-		return;
-
 	/*
 	 * If we are doing more ordered than delalloc we need to just wait on
 	 * ordered extents, otherwise we'll waste time trying to flush delalloc
@@ -595,8 +599,11 @@ static void flush_space(struct btrfs_fs_info *fs_info,
 		break;
 	case FLUSH_DELALLOC:
 	case FLUSH_DELALLOC_WAIT:
+	case FLUSH_DELALLOC_FULL:
+		if (state == FLUSH_DELALLOC_FULL)
+			num_bytes = U64_MAX;
 		shrink_delalloc(fs_info, space_info, num_bytes,
-				state == FLUSH_DELALLOC_WAIT, for_preempt);
+				state != FLUSH_DELALLOC, for_preempt);
 		break;
 	case FLUSH_DELAYED_REFS_NR:
 	case FLUSH_DELAYED_REFS:
@@ -906,6 +913,14 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
 				commit_cycles--;
 		}
 
+		/*
+		 * We do not want to empty the system of delalloc unless we're
+		 * under heavy pressure, so allow one trip through the flushing
+		 * logic before we start doing a FLUSH_DELALLOC_FULL.
+		 */
+		if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles)
+			flush_state++;
+
 		/*
 		 * We don't want to force a chunk allocation until we've tried
 		 * pretty hard to reclaim space.  Think of the case where we
@@ -1069,7 +1084,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
  *   so if we now have space to allocate do the force chunk allocation.
  */
 static const enum btrfs_flush_state data_flush_states[] = {
-	FLUSH_DELALLOC_WAIT,
+	FLUSH_DELALLOC_FULL,
 	RUN_DELAYED_IPUTS,
 	COMMIT_TRANS,
 	ALLOC_CHUNK_FORCE,
@@ -1158,6 +1173,7 @@ static const enum btrfs_flush_state evict_flush_states[] = {
 	FLUSH_DELAYED_REFS,
 	FLUSH_DELALLOC,
 	FLUSH_DELALLOC_WAIT,
+	FLUSH_DELALLOC_FULL,
 	ALLOC_CHUNK,
 	COMMIT_TRANS,
 };
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index c7d19eadecc5..8f58fd95efc7 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -94,6 +94,7 @@ struct btrfs_space_info;
 	EM( FLUSH_DELAYED_ITEMS,	"FLUSH_DELAYED_ITEMS")		\
 	EM( FLUSH_DELALLOC,		"FLUSH_DELALLOC")		\
 	EM( FLUSH_DELALLOC_WAIT,	"FLUSH_DELALLOC_WAIT")		\
+	EM( FLUSH_DELALLOC_FULL,	"FLUSH_DELALLOC_FULL")		\
 	EM( FLUSH_DELAYED_REFS_NR,	"FLUSH_DELAYED_REFS_NR")	\
 	EM( FLUSH_DELAYED_REFS,		"FLUSH_ELAYED_REFS")		\
 	EM( ALLOC_CHUNK,		"ALLOC_CHUNK")			\
-- 
cgit v1.2.3


From e16460707e94c3d4c1b5418cb68b28b8efa903b2 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Wed, 14 Jul 2021 14:47:21 -0400
Subject: btrfs: wait on async extents when flushing delalloc

I've been debugging an early ENOSPC problem in production and finally
root caused it to this problem.  When we switched to the per-inode in
38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in
shrink_delalloc") I pulled out the async extent handling, because we
were doing the correct thing by calling filemap_flush() if we had async
extents set.  This would properly wait on any async extents by locking
the page in the second flush, thus making sure our ordered extents were
properly set up.

However when I switched us back to page based flushing, I used
sync_inode(), which allows us to pass in our own wbc.  The problem here
is that sync_inode() is smarter than the filemap_* helpers, it tries to
avoid calling writepages at all.  This means that our second call could
skip calling do_writepages altogether, and thus not wait on the pagelock
for the async helpers.  This means we could come back before any ordered
extents were created and then simply continue on in our flushing
mechanisms and ENOSPC out when we have plenty of space to use.

Fix this by putting back the async pages logic in shrink_delalloc.  This
allows us to bulk write out everything that we need to, and then we can
wait in one place for the async helpers to catch up, and then wait on
any ordered extents that are created.

Fixes: e076ab2a2ca7 ("btrfs: shrink delalloc pages instead of full inodes")
CC: stable@vger.kernel.org # 5.10+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c      |  4 ----
 fs/btrfs/space-info.c | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 73b6413cc9af..25eb214f56ac 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9879,10 +9879,6 @@ static int start_delalloc_inodes(struct btrfs_root *root,
 					 &work->work);
 		} else {
 			ret = sync_inode(inode, wbc);
-			if (!ret &&
-			    test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-				     &BTRFS_I(inode)->runtime_flags))
-				ret = sync_inode(inode, wbc);
 			btrfs_add_delayed_iput(inode);
 			if (ret || wbc->nr_to_write <= 0)
 				goto out;
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index eb90a262563f..d9c8d738678f 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -532,9 +532,49 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
 	while ((delalloc_bytes || ordered_bytes) && loops < 3) {
 		u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
 		long nr_pages = min_t(u64, temp, LONG_MAX);
+		int async_pages;
 
 		btrfs_start_delalloc_roots(fs_info, nr_pages, true);
 
+		/*
+		 * We need to make sure any outstanding async pages are now
+		 * processed before we continue.  This is because things like
+		 * sync_inode() try to be smart and skip writing if the inode is
+		 * marked clean.  We don't use filemap_fwrite for flushing
+		 * because we want to control how many pages we write out at a
+		 * time, thus this is the only safe way to make sure we've
+		 * waited for outstanding compressed workers to have started
+		 * their jobs and thus have ordered extents set up properly.
+		 *
+		 * This exists because we do not want to wait for each
+		 * individual inode to finish its async work, we simply want to
+		 * start the IO on everybody, and then come back here and wait
+		 * for all of the async work to catch up.  Once we're done with
+		 * that we know we'll have ordered extents for everything and we
+		 * can decide if we wait for that or not.
+		 *
+		 * If we choose to replace this in the future, make absolutely
+		 * sure that the proper waiting is being done in the async case,
+		 * as there have been bugs in that area before.
+		 */
+		async_pages = atomic_read(&fs_info->async_delalloc_pages);
+		if (!async_pages)
+			goto skip_async;
+
+		/*
+		 * We don't want to wait forever, if we wrote less pages in this
+		 * loop than we have outstanding, only wait for that number of
+		 * pages, otherwise we can wait for all async pages to finish
+		 * before continuing.
+		 */
+		if (async_pages > nr_pages)
+			async_pages -= nr_pages;
+		else
+			async_pages = 0;
+		wait_event(fs_info->async_submit_wait,
+			   atomic_read(&fs_info->async_delalloc_pages) <=
+			   async_pages);
+skip_async:
 		loops++;
 		if (wait_ordered && !trans) {
 			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
-- 
cgit v1.2.3


From 5a798493b8f30121363359bba834392f044c169b Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Wed, 14 Jul 2021 14:47:22 -0400
Subject: fs: add a filemap_fdatawrite_wbc helper

Btrfs sometimes needs to flush dirty pages on a bunch of dirty inodes in
order to reclaim metadata reservations.  Unfortunately most helpers in
this area are too smart for us:

1) The normal filemap_fdata* helpers only take range and sync modes, and
   don't give any indication of how much was written, so we can only
   flush full inodes, which isn't what we want in most cases.
2) The normal writeback path requires us to have the s_umount sem held,
   but we can't unconditionally take it in this path because we could
   deadlock.
3) The normal writeback path also skips inodes with I_SYNC set if we
   write with WB_SYNC_NONE.  This isn't the behavior we want under heavy
   ENOSPC pressure, we want to actually make sure the pages are under
   writeback before returning, and if another thread is in the middle of
   writing the file we may return before they're under writeback and
   miss our ordered extents and not properly wait for completion.
4) sync_inode() uses the normal writeback path and has the same problem
   as #3.

What we really want is to call do_writepages() with our wbc.  This way
we can make sure that writeback is actually started on the pages, and we
can control how many pages are written as a whole as we write many
inodes using the same wbc.  Accomplish this with a new helper that does
just that so we can use it for our ENOSPC flushing infrastructure.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 include/linux/fs.h |  2 ++
 mm/filemap.c       | 36 +++++++++++++++++++++++++++---------
 2 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 640574294216..452cd4843843 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2891,6 +2891,8 @@ extern int filemap_fdatawrite_range(struct address_space *mapping,
 				loff_t start, loff_t end);
 extern int filemap_check_errors(struct address_space *mapping);
 extern void __filemap_set_wb_err(struct address_space *mapping, int err);
+int filemap_fdatawrite_wbc(struct address_space *mapping,
+			   struct writeback_control *wbc);
 
 static inline int filemap_write_and_wait(struct address_space *mapping)
 {
diff --git a/mm/filemap.c b/mm/filemap.c
index d1458ecf2f51..034d370d4ebb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -377,6 +377,32 @@ static int filemap_check_and_keep_errors(struct address_space *mapping)
 	return 0;
 }
 
+/**
+ * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
+ * @mapping:	address space structure to write
+ * @wbc:	the writeback_control controlling the writeout
+ *
+ * Call writepages on the mapping using the provided wbc to control the
+ * writeout.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int filemap_fdatawrite_wbc(struct address_space *mapping,
+			   struct writeback_control *wbc)
+{
+	int ret;
+
+	if (!mapping_can_writeback(mapping) ||
+	    !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+		return 0;
+
+	wbc_attach_fdatawrite_inode(wbc, mapping->host);
+	ret = do_writepages(mapping, wbc);
+	wbc_detach_inode(wbc);
+	return ret;
+}
+EXPORT_SYMBOL(filemap_fdatawrite_wbc);
+
 /**
  * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
  * @mapping:	address space structure to write
@@ -397,7 +423,6 @@ static int filemap_check_and_keep_errors(struct address_space *mapping)
 int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 				loff_t end, int sync_mode)
 {
-	int ret;
 	struct writeback_control wbc = {
 		.sync_mode = sync_mode,
 		.nr_to_write = LONG_MAX,
@@ -405,14 +430,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 		.range_end = end,
 	};
 
-	if (!mapping_can_writeback(mapping) ||
-	    !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
-		return 0;
-
-	wbc_attach_fdatawrite_inode(&wbc, mapping->host);
-	ret = do_writepages(mapping, &wbc);
-	wbc_detach_inode(&wbc);
-	return ret;
+	return filemap_fdatawrite_wbc(mapping, &wbc);
 }
 
 static inline int __filemap_fdatawrite(struct address_space *mapping,
-- 
cgit v1.2.3


From b3776305278e5937366f512d3e655ace13b8b027 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Wed, 14 Jul 2021 14:47:23 -0400
Subject: btrfs: use the filemap_fdatawrite_wbc helper for delalloc shrinking

sync_inode() has some holes that can cause problems if we're under heavy
ENOSPC pressure.  If there's writeback running on a separate thread
sync_inode() will skip writing the inode altogether.  What we really
want is to make sure writeback has been started on all the pages to make
sure we can see the ordered extents and wait on them if appropriate.
Switch to this new helper which will allow us to accomplish this and
avoid ENOSPC'ing early.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 25eb214f56ac..c917e3117842 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9878,7 +9878,7 @@ static int start_delalloc_inodes(struct btrfs_root *root,
 			btrfs_queue_work(root->fs_info->flush_workers,
 					 &work->work);
 		} else {
-			ret = sync_inode(inode, wbc);
+			ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
 			btrfs_add_delayed_iput(inode);
 			if (ret || wbc->nr_to_write <= 0)
 				goto out;
-- 
cgit v1.2.3


From 25d23cd01621b740ce3e8f4f0fab40e24d163462 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Wed, 14 Jul 2021 14:47:24 -0400
Subject: 9p: migrate from sync_inode to filemap_fdatawrite_wbc

We're going to remove sync_inode, so migrate to filemap_fdatawrite_wbc
instead.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/9p/vfs_file.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 59c32c9b799f..6b64e8391f30 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -625,12 +625,7 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
 	p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma);
 
 	inode = file_inode(vma->vm_file);
-
-	if (!mapping_can_writeback(inode->i_mapping))
-		wbc.nr_to_write = 0;
-
-	might_sleep();
-	sync_inode(inode, &wbc);
+	filemap_fdatawrite_wbc(inode->i_mapping, &wbc);
 }
 
 
-- 
cgit v1.2.3


From 5662c967c69dfd162a0667d69bad776939bedf85 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Wed, 14 Jul 2021 14:47:25 -0400
Subject: fs: kill sync_inode

Now that all users of sync_inode() have been deleted, remove
sync_inode().

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/fs-writeback.c  | 19 +------------------
 include/linux/fs.h |  1 -
 2 files changed, 1 insertion(+), 19 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 4c3370548982..eb57dade6076 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -2729,23 +2729,6 @@ int write_inode_now(struct inode *inode, int sync)
 }
 EXPORT_SYMBOL(write_inode_now);
 
-/**
- * sync_inode - write an inode and its pages to disk.
- * @inode: the inode to sync
- * @wbc: controls the writeback mode
- *
- * sync_inode() will write an inode and its pages to disk.  It will also
- * correctly update the inode on its superblock's dirty inode lists and will
- * update inode->i_state.
- *
- * The caller must have a ref on the inode.
- */
-int sync_inode(struct inode *inode, struct writeback_control *wbc)
-{
-	return writeback_single_inode(inode, wbc);
-}
-EXPORT_SYMBOL(sync_inode);
-
 /**
  * sync_inode_metadata - write an inode to disk
  * @inode: the inode to sync
@@ -2762,6 +2745,6 @@ int sync_inode_metadata(struct inode *inode, int wait)
 		.nr_to_write = 0, /* metadata-only */
 	};
 
-	return sync_inode(inode, &wbc);
+	return writeback_single_inode(inode, &wbc);
 }
 EXPORT_SYMBOL(sync_inode_metadata);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 452cd4843843..1751addcb36e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2457,7 +2457,6 @@ static inline void file_accessed(struct file *file)
 
 extern int file_modified(struct file *file);
 
-int sync_inode(struct inode *inode, struct writeback_control *wbc);
 int sync_inode_metadata(struct inode *inode, int wait);
 
 struct file_system_type {
-- 
cgit v1.2.3


From e83502ca5f1e1f03fb1393008ec22d17e7dc9882 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Wed, 21 Jul 2021 21:43:32 +0900
Subject: block: fix argument type of bio_trim()

The function bio_trim has offset and size arguments that are declared
as int.

The callers of this function use sector_t type when passing the offset
and size, e.g. drivers/md/raid1.c:narrow_write_error() and
drivers/md/raid1.c:narrow_write_error().

Change offset and size arguments to sector_t type for bio_trim(). Also,
add WARN_ON_ONCE() to catch their overflow.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 block/bio.c               | 12 +++++++-----
 include/linux/bio.h       |  2 +-
 include/linux/blk_types.h |  1 +
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 1fab762e079b..77cadcba93b9 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1463,12 +1463,15 @@ EXPORT_SYMBOL(bio_split);
  * @bio:	bio to trim
  * @offset:	number of sectors to trim from the front of @bio
  * @size:	size we want to trim @bio to, in sectors
+ *
+ * This function is typically used for bios that are cloned and submitted
+ * to the underlying device in parts.
  */
-void bio_trim(struct bio *bio, int offset, int size)
+void bio_trim(struct bio *bio, sector_t offset, sector_t size)
 {
-	/* 'bio' is a cloned bio which we need to trim to match
-	 * the given offset and size.
-	 */
+	if (WARN_ON_ONCE(offset > BIO_MAX_SECTORS || size > BIO_MAX_SECTORS ||
+			 offset + size > bio->bi_iter.bi_size))
+		return;
 
 	size <<= 9;
 	if (offset == 0 && size == bio->bi_iter.bi_size)
@@ -1479,7 +1482,6 @@ void bio_trim(struct bio *bio, int offset, int size)
 
 	if (bio_integrity(bio))
 		bio_integrity_trim(bio);
-
 }
 EXPORT_SYMBOL_GPL(bio_trim);
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 2203b686e1f0..8a451d77b573 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -375,7 +375,7 @@ static inline void bip_set_seed(struct bio_integrity_payload *bip,
 
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
-extern void bio_trim(struct bio *bio, int offset, int size);
+void bio_trim(struct bio *bio, sector_t offset, sector_t size);
 extern struct bio *bio_split(struct bio *bio, int sectors,
 			     gfp_t gfp, struct bio_set *bs);
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 290f9061b29a..bca4d33876d4 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -281,6 +281,7 @@ struct bio {
 };
 
 #define BIO_RESET_BYTES		offsetof(struct bio, bi_max_vecs)
+#define BIO_MAX_SECTORS		(UINT_MAX >> SECTOR_SHIFT)
 
 /*
  * bio flags
-- 
cgit v1.2.3


From 21dda654d4808833668b380e5f0b9befff8640ae Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Wed, 21 Jul 2021 21:43:33 +0900
Subject: btrfs: fix argument type of btrfs_bio_clone_partial()

The offset and can never be negative use unsigned int instead of int
type for them.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 4 +++-
 fs/btrfs/extent_io.h | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 378ae860b3f3..3abb9365fbe9 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3161,11 +3161,13 @@ struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
 	return bio;
 }
 
-struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
+struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
 {
 	struct bio *bio;
 	struct btrfs_io_bio *btrfs_bio;
 
+	ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
+
 	/* this will never fail when it's backed by a bioset */
 	bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
 	ASSERT(bio);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 62027f551b44..53abdc280451 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -280,7 +280,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
 struct bio *btrfs_bio_alloc(u64 first_byte);
 struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
 struct bio *btrfs_bio_clone(struct bio *bio);
-struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size);
+struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);
 
 int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
 		      u64 length, u64 logical, struct page *page,
-- 
cgit v1.2.3


From 42b5d73b5d231bbe38639c6dec913505d7f55372 Mon Sep 17 00:00:00 2001
From: Naohiro Aota <naohiro.aota@wdc.com>
Date: Wed, 21 Jul 2021 21:43:34 +0900
Subject: btrfs: drop unnecessary ASSERT from btrfs_submit_direct()

When on SINGLE block group, btrfs_get_io_geometry() will return "the
size of the block group - the offset of the logical address within the
block group" as geom.len. Since we allow up to 8 GiB zone size on zoned
filesystem, we can have up to 8 GiB block group, so can have up to 8 GiB
geom.len as well. With this setup, we easily hit the "ASSERT(geom.len <=
INT_MAX);".

The ASSERT looks like to guard btrfs_bio_clone_partial() and bio_trim()
which both take "int" (now u64 due to the previous patch). So to be
precise the ASSERT should check if clone_len <= UINT_MAX. But actually,
clone_len is already capped by bio.bi_iter.bi_size which is unsigned
int. So the ASSERT is not necessary.

Drop the ASSERT and properly compare submit_len and geom.len in u64.
Then, let the implicit casting to convert it to u64.

Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c917e3117842..3ed88f2ac393 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8239,8 +8239,8 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
 	u64 start_sector;
 	int async_submit = 0;
 	u64 submit_len;
-	int clone_offset = 0;
-	int clone_len;
+	u64 clone_offset = 0;
+	u64 clone_len;
 	u64 logical;
 	int ret;
 	blk_status_t status;
@@ -8288,9 +8288,9 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
 			status = errno_to_blk_status(ret);
 			goto out_err_em;
 		}
-		ASSERT(geom.len <= INT_MAX);
 
-		clone_len = min_t(int, submit_len, geom.len);
+		clone_len = min(submit_len, geom.len);
+		ASSERT(clone_len <= UINT_MAX);
 
 		/*
 		 * This will never fail as it's passing GPF_NOFS and
-- 
cgit v1.2.3


From 6e8e777deb5cbff76bcd34b1f45bc747f48e8abe Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 27 Jul 2021 11:24:44 +0100
Subject: btrfs: eliminate some false positives when checking if inode was
 logged

When checking if an inode was previously logged in the current transaction
through the helper inode_logged(), we can return some false positives that
can be easily eliminated. These correspond to the cases where an inode has
a ->logged_trans value that is not zero and its value is smaller then the
ID of the current transaction. This means we know exactly that the inode
was never logged before in the current transaction, so we can return false
and avoid the callers to do extra work:

1) Having btrfs_del_dir_entries_in_log() and btrfs_del_inode_ref_in_log()
   unnecessarily join a log transaction and do deletion searches in a log
   tree that will not find anything. This just adds unnecessary contention
   on extent buffer locks;

2) Having btrfs_log_new_name() unnecessarily log an inode when it is not
   needed. If the inode was not logged before, we don't need to log it in
   LOG_INODE_EXISTS mode.

So just make sure that any false positive only happens when ->logged_trans
has a value of 0.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 191dea1d2416..d09202e0c9df 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3421,14 +3421,10 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
 }
 
 /*
- * Check if an inode was logged in the current transaction. We can't always rely
- * on an inode's logged_trans value, because it's an in-memory only field and
- * therefore not persisted. This means that its value is lost if the inode gets
- * evicted and loaded again from disk (in which case it has a value of 0, and
- * certainly it is smaller then any possible transaction ID), when that happens
- * the full_sync flag is set in the inode's runtime flags, so on that case we
- * assume eviction happened and ignore the logged_trans value, assuming the
- * worst case, that the inode was logged before in the current transaction.
+ * Check if an inode was logged in the current transaction. This may often
+ * return some false positives, because logged_trans is an in memory only field,
+ * not persisted anywhere. This is meant to be used in contexts where a false
+ * positive has no functional consequences.
  */
 static bool inode_logged(struct btrfs_trans_handle *trans,
 			 struct btrfs_inode *inode)
@@ -3436,7 +3432,18 @@ static bool inode_logged(struct btrfs_trans_handle *trans,
 	if (inode->logged_trans == trans->transid)
 		return true;
 
-	if (inode->last_trans == trans->transid &&
+	/*
+	 * The inode's logged_trans is always 0 when we load it (because it is
+	 * not persisted in the inode item or elsewhere). So if it is 0, the
+	 * inode was last modified in the current transaction and has the
+	 * full_sync flag set, then the inode may have been logged before in
+	 * the current transaction, then evicted and loaded again in the current
+	 * transaction - or may have never been logged in the current transaction,
+	 * but since we can not be sure, we have to assume it was, otherwise our
+	 * callers can leave an inconsistent log.
+	 */
+	if (inode->logged_trans == 0 &&
+	    inode->last_trans == trans->transid &&
 	    test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
 	    !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
 		return true;
-- 
cgit v1.2.3


From bd54f381a12ac695593271a663d36d14220215b2 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 27 Jul 2021 11:24:45 +0100
Subject: btrfs: do not pin logs too early during renames

During renames we pin the logs of the roots a bit too early, before the
calls to btrfs_insert_inode_ref(). We can pin the logs after those calls,
since those will not change anything in a log tree.

In a scenario where we have multiple and diverse filesystem operations
running in parallel, those calls can take a significant amount of time,
due to lock contention on extent buffers, and delay log commits from other
tasks for longer than necessary.

So just pin logs after calls to btrfs_insert_inode_ref() and right before
the first operation that can update a log tree.

The following script that uses dbench was used for testing:

  $ cat dbench-test.sh
  #!/bin/bash

  DEV=/dev/nvme0n1
  MNT=/mnt/nvme0n1
  MOUNT_OPTIONS="-o ssd"
  MKFS_OPTIONS="-m single -d single"

  echo "performance" | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

  umount $DEV &> /dev/null
  mkfs.btrfs -f $MKFS_OPTIONS $DEV
  mount $MOUNT_OPTIONS $DEV $MNT

  dbench -D $MNT -t 120 16

  umount $MNT

The tests were run on a machine with 12 cores, 64G of RAN, a NVMe device
and using a non-debug kernel config (Debian's default config).

The results compare a branch without this patch and without the previous
patch in the series, that has the subject:

 "btrfs: eliminate some false positives when checking if inode was logged"

Versus the same branch with these two patches applied.

dbench with 8 clients, results before:

 Operation      Count    AvgLat    MaxLat
 ----------------------------------------
 NTCreateX    4391359     0.009   249.745
 Close        3225882     0.001     3.243
 Rename        185953     0.065   240.643
 Unlink        886669     0.049   249.906
 Deltree          112     2.455   217.433
 Mkdir             56     0.002     0.004
 Qpathinfo    3980281     0.004     3.109
 Qfileinfo     697579     0.001     0.187
 Qfsinfo       729780     0.002     2.424
 Sfileinfo     357764     0.004     1.415
 Find         1538861     0.016     4.863
 WriteX       2189666     0.010     3.327
 ReadX        6883443     0.002     0.729
 LockX          14298     0.002     0.073
 UnlockX        14298     0.001     0.042
 Flush         307777     2.447   303.663

Throughput 1149.6 MB/sec  8 clients  8 procs  max_latency=303.666 ms

dbench with 8 clients, results after:

 Operation      Count    AvgLat    MaxLat
 ----------------------------------------
 NTCreateX    4269920     0.009   213.532
 Close        3136653     0.001     0.690
 Rename        180805     0.082   213.858
 Unlink        862189     0.050   172.893
 Deltree          112     2.998   218.328
 Mkdir             56     0.002     0.003
 Qpathinfo    3870158     0.004     5.072
 Qfileinfo     678375     0.001     0.194
 Qfsinfo       709604     0.002     0.485
 Sfileinfo     347850     0.004     1.304
 Find         1496310     0.017     5.504
 WriteX       2129613     0.010     2.882
 ReadX        6693066     0.002     1.517
 LockX          13902     0.002     0.075
 UnlockX        13902     0.001     0.055
 Flush         299276     2.511   220.189

Throughput 1187.33 MB/sec  8 clients  8 procs  max_latency=220.194 ms

+3.2% throughput, -31.8% max latency

dbench with 16 clients, results before:

 Operation      Count    AvgLat    MaxLat
 ----------------------------------------
 NTCreateX    5978334     0.028   156.507
 Close        4391598     0.001     1.345
 Rename        253136     0.241   155.057
 Unlink       1207220     0.182   257.344
 Deltree          160     6.123    36.277
 Mkdir             80     0.003     0.005
 Qpathinfo    5418817     0.012     6.867
 Qfileinfo     949929     0.001     0.941
 Qfsinfo       993560     0.002     1.386
 Sfileinfo     486904     0.004     2.829
 Find         2095088     0.059     8.164
 WriteX       2982319     0.017     9.029
 ReadX        9371484     0.002     4.052
 LockX          19470     0.002     0.461
 UnlockX        19470     0.001     0.990
 Flush         418936     2.740   347.902

Throughput 1495.31 MB/sec  16 clients  16 procs  max_latency=347.909 ms

dbench with 16 clients, results after:

 Operation      Count    AvgLat    MaxLat
 ----------------------------------------
 NTCreateX    5711833     0.029   131.240
 Close        4195897     0.001     1.732
 Rename        241849     0.204   147.831
 Unlink       1153341     0.184   231.322
 Deltree          160     6.086    30.198
 Mkdir             80     0.003     0.021
 Qpathinfo    5177011     0.012     7.150
 Qfileinfo     907768     0.001     0.793
 Qfsinfo       949205     0.002     1.431
 Sfileinfo     465317     0.004     2.454
 Find         2001541     0.058     7.819
 WriteX       2850661     0.017     9.110
 ReadX        8952289     0.002     3.991
 LockX          18596     0.002     0.655
 UnlockX        18596     0.001     0.179
 Flush         400342     2.879   293.607

Throughput 1565.73 MB/sec  16 clients  16 procs  max_latency=293.611 ms

+4.6% throughput, -16.9% max latency

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 48 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 42 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3ed88f2ac393..bfa3991e476b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9350,8 +9350,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
 		/* force full log commit if subvolume involved. */
 		btrfs_set_log_full_commit(trans);
 	} else {
-		btrfs_pin_log_trans(root);
-		root_log_pinned = true;
 		ret = btrfs_insert_inode_ref(trans, dest,
 					     new_dentry->d_name.name,
 					     new_dentry->d_name.len,
@@ -9368,8 +9366,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
 		/* force full log commit if subvolume involved. */
 		btrfs_set_log_full_commit(trans);
 	} else {
-		btrfs_pin_log_trans(dest);
-		dest_log_pinned = true;
 		ret = btrfs_insert_inode_ref(trans, root,
 					     old_dentry->d_name.name,
 					     old_dentry->d_name.len,
@@ -9400,6 +9396,29 @@ static int btrfs_rename_exchange(struct inode *old_dir,
 				BTRFS_I(new_inode), 1);
 	}
 
+	/*
+	 * Now pin the logs of the roots. We do it to ensure that no other task
+	 * can sync the logs while we are in progress with the rename, because
+	 * that could result in an inconsistency in case any of the inodes that
+	 * are part of this rename operation were logged before.
+	 *
+	 * We pin the logs even if at this precise moment none of the inodes was
+	 * logged before. This is because right after we checked for that, some
+	 * other task fsyncing some other inode not involved with this rename
+	 * operation could log that one of our inodes exists.
+	 *
+	 * We don't need to pin the logs before the above calls to
+	 * btrfs_insert_inode_ref(), since those don't ever need to change a log.
+	 */
+	if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+		btrfs_pin_log_trans(root);
+		root_log_pinned = true;
+	}
+	if (new_ino != BTRFS_FIRST_FREE_OBJECTID) {
+		btrfs_pin_log_trans(dest);
+		dest_log_pinned = true;
+	}
+
 	/* src is a subvolume */
 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
 		ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
@@ -9652,8 +9671,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		/* force full log commit if subvolume involved. */
 		btrfs_set_log_full_commit(trans);
 	} else {
-		btrfs_pin_log_trans(root);
-		log_pinned = true;
 		ret = btrfs_insert_inode_ref(trans, dest,
 					     new_dentry->d_name.name,
 					     new_dentry->d_name.len,
@@ -9677,6 +9694,25 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
 		ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
 	} else {
+		/*
+		 * Now pin the log. We do it to ensure that no other task can
+		 * sync the log while we are in progress with the rename, as
+		 * that could result in an inconsistency in case any of the
+		 * inodes that are part of this rename operation were logged
+		 * before.
+		 *
+		 * We pin the log even if at this precise moment none of the
+		 * inodes was logged before. This is because right after we
+		 * checked for that, some other task fsyncing some other inode
+		 * not involved with this rename operation could log that one of
+		 * our inodes exists.
+		 *
+		 * We don't need to pin the logs before the above call to
+		 * btrfs_insert_inode_ref(), since that does not need to change
+		 * a log.
+		 */
+		btrfs_pin_log_trans(root);
+		log_pinned = true;
 		ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
 					BTRFS_I(d_inode(old_dentry)),
 					old_dentry->d_name.name,
-- 
cgit v1.2.3


From b2f78e88052bc0bee56bbf646d245fcfb431a873 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 22 Jul 2021 20:54:37 +0200
Subject: btrfs: allow degenerate raid0/raid10

The data on raid0 and raid10 are supposed to be spread over multiple
devices, so the minimum constraints are set to 2 and 4 respectively.
This is an artificial limit and there's some interest to remove it.

Change this to allow raid0 on one device and raid10 on two devices. This
works as expected eg. when converting or removing devices.

The only difference is when raid0 on two devices gets one device
removed. Unpatched would silently create a single profile, while newly
it would be raid0.

The motivation is to allow to preserve the profile type as long as it
possible for some intermediate state (device removal, conversion), or
when there are disks of different size, with raid0 the otherwise
unusable space of the last device will be used too. Similarly for
raid10, though the two largest devices would need to be the same.

Unpatched kernel will mount and use the degenerate profiles just fine
but won't allow any operation that would not satisfy the stricter device
number constraints, eg. not allowing to go from 3 to 2 devices for
raid10 or various profile conversions.

Example output:

  # btrfs fi us -T .
  Overall:
      Device size:                  10.00GiB
      Device allocated:              1.01GiB
      Device unallocated:            8.99GiB
      Device missing:                  0.00B
      Used:                        200.61MiB
      Free (estimated):              9.79GiB      (min: 9.79GiB)
      Free (statfs, df):             9.79GiB
      Data ratio:                       1.00
      Metadata ratio:                   1.00
      Global reserve:                3.25MiB      (used: 0.00B)
      Multiple profiles:                  no

		Data      Metadata  System
  Id Path       RAID0     single    single   Unallocated
  -- ---------- --------- --------- -------- -----------
   1 /dev/sda10   1.00GiB   8.00MiB  1.00MiB     8.99GiB
  -- ---------- --------- --------- -------- -----------
     Total        1.00GiB   8.00MiB  1.00MiB     8.99GiB
     Used       200.25MiB 352.00KiB 16.00KiB

  # btrfs dev us .
  /dev/sda10, ID: 1
     Device size:            10.00GiB
     Device slack:              0.00B
     Data,RAID0/1:            1.00GiB
     Metadata,single:         8.00MiB
     System,single:           1.00MiB
     Unallocated:             8.99GiB

Note "Data,RAID0/1", with btrfs-progs 5.13+ the number of devices per
profile is printed.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8e61307ffad2..8526cc39faa9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -38,7 +38,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 		.sub_stripes	= 2,
 		.dev_stripes	= 1,
 		.devs_max	= 0,	/* 0 == as many as possible */
-		.devs_min	= 4,
+		.devs_min	= 2,
 		.tolerated_failures = 1,
 		.devs_increment	= 2,
 		.ncopies	= 2,
@@ -103,7 +103,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 		.sub_stripes	= 1,
 		.dev_stripes	= 1,
 		.devs_max	= 0,
-		.devs_min	= 2,
+		.devs_min	= 1,
 		.tolerated_failures = 0,
 		.devs_increment	= 1,
 		.ncopies	= 1,
-- 
cgit v1.2.3


From 7361b4ae03d9f3325852eec1a7395ec5f1ff802d Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Wed, 28 Jul 2021 14:05:05 +0800
Subject: btrfs: remove the dead comment in writepage_delalloc()

When btrfs_run_delalloc_range() failed, we will error out.

But there is a strange comment mentioning that
btrfs_run_delalloc_range() could have returned value >0 to indicate the
IO has already started.

Commit 40f765805f08 ("Btrfs: split up __extent_writepage to lower stack
usage") introduced the comment, but unfortunately at that time, we were
already using @page_started to indicate that case, and still return 0.

Furthermore, even if that comment was right (which is not), we would
return -EIO if the IO had already started.

By all means the comment is incorrect, just remove the comment along
with the dead check.

Just to be extra safe, add an ASSERT() in btrfs_run_delalloc_range() to
make sure we either return 0 or error, no positive return value.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 8 +-------
 fs/btrfs/inode.c     | 1 +
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3abb9365fbe9..66888b10b00d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3803,13 +3803,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
 		if (ret) {
 			btrfs_page_set_error(inode->root->fs_info, page,
 					     page_offset(page), PAGE_SIZE);
-			/*
-			 * btrfs_run_delalloc_range should return < 0 for error
-			 * but just in case, we use > 0 here meaning the IO is
-			 * started, so we don't want to return > 0 unless
-			 * things are going well.
-			 */
-			return ret < 0 ? ret : -EIO;
+			return ret;
 		}
 		/*
 		 * delalloc_end is already one less than the total length, so
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bfa3991e476b..949f6a6c616a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1962,6 +1962,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
 		ret = cow_file_range_async(inode, wbc, locked_page, start, end,
 					   page_started, nr_written);
 	}
+	ASSERT(ret <= 0);
 	if (ret)
 		btrfs_cleanup_ordered_extents(inode, locked_page, start,
 					      end - start + 1);
-- 
cgit v1.2.3


From efc222f8d79c0431c92ae8c72bc62d2e6bcf0a00 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Wed, 28 Jul 2021 07:03:05 +0800
Subject: btrfs: simplify return values in btrfs_check_raid_min_devices

Function btrfs_check_raid_min_devices() returns error code from the enum
btrfs_err_code and it starts from 1. So there is no need to check if ret
is > 0. So drop this check and also drop the local variable ret.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8526cc39faa9..230192d097c4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1949,12 +1949,8 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
 		if (!(all_avail & btrfs_raid_array[i].bg_flag))
 			continue;
 
-		if (num_devices < btrfs_raid_array[i].devs_min) {
-			int ret = btrfs_raid_array[i].mindev_error;
-
-			if (ret)
-				return ret;
-		}
+		if (num_devices < btrfs_raid_array[i].devs_min)
+			return btrfs_raid_array[i].mindev_error;
 	}
 
 	return 0;
-- 
cgit v1.2.3


From 77eea05e7851d910b7992c8c237a6b5d462050da Mon Sep 17 00:00:00 2001
From: Boris Burkov <boris@bur.io>
Date: Wed, 30 Jun 2021 13:01:48 -0700
Subject: btrfs: add ro compat flags to inodes

Currently, inode flags are fully backwards incompatible in btrfs. If we
introduce a new inode flag, then tree-checker will detect it and fail.
This can even cause us to fail to mount entirely. To make it possible to
introduce new flags which can be read-only compatible, like VERITY, we
add new ro flags to btrfs without treating them quite so harshly in
tree-checker. A read-only file system can survive an unexpected flag,
and can be mounted.

As for the implementation, it unfortunately gets a little complicated.

The on-disk representation of the inode, btrfs_inode_item, has an __le64
for flags but the in-memory representation, btrfs_inode, uses a u32.
David Sterba had the nice idea that we could reclaim those wasted 32 bits
on disk and use them for the new ro_compat flags.

It turns out that the tree-checker code which checks for unknown flags
is broken, and ignores the upper 32 bits we are hoping to use. The issue
is that the flags use the literal 1 rather than 1ULL, so the flags are
signed ints, and one of them is specifically (1 << 31). As a result, the
mask which ORs the flags is a negative integer on machines where int is
32 bit twos complement. When tree-checker evaluates the expression:

  btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK)

The mask is something like 0x80000abc, which gets promoted to u64 with
sign extension to 0xffffffff80000abc. Negating that 64 bit mask leaves
all the upper bits zeroed, and we can't detect unexpected flags.

This suggests that we can't use those bits after all. Luckily, we have
good reason to believe that they are zero anyway. Inode flags are
metadata, which is always checksummed, so any bit flips that would
introduce 1s would cause a checksum failure anyway (excluding the
improbable case of the checksum getting corrupted exactly badly).

Further, unless the 1 << 31 flag is used, the cast to u64 of the 32 bit
inode flag should preserve its value and not add leading zeroes
(at least for twos complement). The only place that flag
(BTRFS_INODE_ROOT_ITEM_INIT) is used is in a special inode embedded in
the root item, and indeed for that inode we see 0xffffffff80000000 as
the flags on disk. However, that inode is never seen by tree checker,
nor is it used in a context where verity might be meaningful.
Theoretically, a future ro flag might cause trouble on that inode, so we
should proactively clean up that mess before it does.

With the introduction of the new ro flags, keep two separate unsigned
masks and check them against the appropriate u32. Since we no longer run
afoul of sign extension, this also stops writing out 0xffffffff80000000
in root_item inodes going forward.

Signed-off-by: Boris Burkov <boris@bur.io>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/btrfs_inode.h   | 20 +++++++++++++++++++-
 fs/btrfs/ctree.h         | 30 ++++++++++++++++--------------
 fs/btrfs/delayed-inode.c |  9 +++++++--
 fs/btrfs/inode.c         |  9 +++++++--
 fs/btrfs/ioctl.c         |  7 ++++---
 fs/btrfs/tree-checker.c  | 17 +++++++++++++----
 fs/btrfs/tree-log.c      |  5 ++++-
 7 files changed, 70 insertions(+), 27 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index c652e19ad74e..1093b00130be 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -189,8 +189,10 @@ struct btrfs_inode {
 	 */
 	u64 csum_bytes;
 
-	/* flags field from the on disk inode */
+	/* Backwards incompatible flags, lower half of inode_item::flags  */
 	u32 flags;
+	/* Read-only compatibility flags, upper half of inode_item::flags */
+	u32 ro_flags;
 
 	/*
 	 * Counters to keep track of the number of extent item's we may use due
@@ -348,6 +350,22 @@ struct btrfs_dio_private {
 	u8 csums[];
 };
 
+/*
+ * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two
+ * separate u32s. These two functions convert between the two representations.
+ */
+static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags)
+{
+	return (flags | ((u64)ro_flags << 32));
+}
+
+static inline void btrfs_inode_split_flags(u64 inode_item_flags,
+					   u32 *flags, u32 *ro_flags)
+{
+	*flags = (u32)inode_item_flags;
+	*ro_flags = (u32)(inode_item_flags >> 32);
+}
+
 /* Array of bytes with variable length, hexadecimal format 0x1234 */
 #define CSUM_FMT				"0x%*phN"
 #define CSUM_FMT_VALUE(size, bytes)		size, bytes
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fd3084feb4b5..9e3b7a56a78f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1482,20 +1482,20 @@ do {                                                                   \
 /*
  * Inode flags
  */
-#define BTRFS_INODE_NODATASUM		(1 << 0)
-#define BTRFS_INODE_NODATACOW		(1 << 1)
-#define BTRFS_INODE_READONLY		(1 << 2)
-#define BTRFS_INODE_NOCOMPRESS		(1 << 3)
-#define BTRFS_INODE_PREALLOC		(1 << 4)
-#define BTRFS_INODE_SYNC		(1 << 5)
-#define BTRFS_INODE_IMMUTABLE		(1 << 6)
-#define BTRFS_INODE_APPEND		(1 << 7)
-#define BTRFS_INODE_NODUMP		(1 << 8)
-#define BTRFS_INODE_NOATIME		(1 << 9)
-#define BTRFS_INODE_DIRSYNC		(1 << 10)
-#define BTRFS_INODE_COMPRESS		(1 << 11)
-
-#define BTRFS_INODE_ROOT_ITEM_INIT	(1 << 31)
+#define BTRFS_INODE_NODATASUM		(1U << 0)
+#define BTRFS_INODE_NODATACOW		(1U << 1)
+#define BTRFS_INODE_READONLY		(1U << 2)
+#define BTRFS_INODE_NOCOMPRESS		(1U << 3)
+#define BTRFS_INODE_PREALLOC		(1U << 4)
+#define BTRFS_INODE_SYNC		(1U << 5)
+#define BTRFS_INODE_IMMUTABLE		(1U << 6)
+#define BTRFS_INODE_APPEND		(1U << 7)
+#define BTRFS_INODE_NODUMP		(1U << 8)
+#define BTRFS_INODE_NOATIME		(1U << 9)
+#define BTRFS_INODE_DIRSYNC		(1U << 10)
+#define BTRFS_INODE_COMPRESS		(1U << 11)
+
+#define BTRFS_INODE_ROOT_ITEM_INIT	(1U << 31)
 
 #define BTRFS_INODE_FLAG_MASK						\
 	(BTRFS_INODE_NODATASUM |					\
@@ -1512,6 +1512,8 @@ do {                                                                   \
 	 BTRFS_INODE_COMPRESS |						\
 	 BTRFS_INODE_ROOT_ITEM_INIT)
 
+#define BTRFS_INODE_RO_FLAG_MASK					(0)
+
 struct btrfs_map_token {
 	struct extent_buffer *eb;
 	char *kaddr;
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 61452f04181a..1e08eb2b27f0 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1645,6 +1645,8 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
 				  struct btrfs_inode_item *inode_item,
 				  struct inode *inode)
 {
+	u64 flags;
+
 	btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode));
 	btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode));
 	btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
@@ -1657,7 +1659,9 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
 				       inode_peek_iversion(inode));
 	btrfs_set_stack_inode_transid(inode_item, trans->transid);
 	btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
-	btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
+	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
+					  BTRFS_I(inode)->ro_flags);
+	btrfs_set_stack_inode_flags(inode_item, flags);
 	btrfs_set_stack_inode_block_group(inode_item, 0);
 
 	btrfs_set_stack_timespec_sec(&inode_item->atime,
@@ -1715,7 +1719,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
 				   btrfs_stack_inode_sequence(inode_item));
 	inode->i_rdev = 0;
 	*rdev = btrfs_stack_inode_rdev(inode_item);
-	BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
+	btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item),
+				&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
 
 	inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
 	inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 949f6a6c616a..cd5a67ba7e71 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3764,7 +3764,8 @@ static int btrfs_read_locked_inode(struct inode *inode,
 	rdev = btrfs_inode_rdev(leaf, inode_item);
 
 	BTRFS_I(inode)->index_cnt = (u64)-1;
-	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
+	btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
+				&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
 
 cache_index:
 	/*
@@ -3895,6 +3896,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 			    struct inode *inode)
 {
 	struct btrfs_map_token token;
+	u64 flags;
 
 	btrfs_init_map_token(&token, leaf);
 
@@ -3930,7 +3932,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
 	btrfs_set_token_inode_transid(&token, item, trans->transid);
 	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
-	btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
+	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
+					  BTRFS_I(inode)->ro_flags);
+	btrfs_set_token_inode_flags(&token, item, flags);
 	btrfs_set_token_inode_block_group(&token, item, 0);
 }
 
@@ -9064,6 +9068,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei->defrag_bytes = 0;
 	ei->disk_i_size = 0;
 	ei->flags = 0;
+	ei->ro_flags = 0;
 	ei->csum_bytes = 0;
 	ei->index_cnt = (u64)-1;
 	ei->dir_index = 0;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4d809899c076..17aefb5f08ea 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -103,9 +103,10 @@ static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
  * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS
  * ioctl.
  */
-static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags)
+static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode)
 {
 	unsigned int iflags = 0;
+	u32 flags = binode->flags;
 
 	if (flags & BTRFS_INODE_SYNC)
 		iflags |= FS_SYNC_FL;
@@ -200,7 +201,7 @@ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
 {
 	struct btrfs_inode *binode = BTRFS_I(d_inode(dentry));
 
-	fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode->flags));
+	fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode));
 	return 0;
 }
 
@@ -224,7 +225,7 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns,
 		return -EOPNOTSUPP;
 
 	fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags);
-	old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags);
+	old_fsflags = btrfs_inode_flags_to_fsflags(binode);
 	ret = check_fsflags(old_fsflags, fsflags);
 	if (ret)
 		return ret;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 7ba94b683ee3..7733e8ac0a69 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -24,6 +24,7 @@
 #include "compression.h"
 #include "volumes.h"
 #include "misc.h"
+#include "btrfs_inode.h"
 
 /*
  * Error message should follow the following format:
@@ -1008,6 +1009,8 @@ static int check_inode_item(struct extent_buffer *leaf,
 	u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777);
 	u32 mode;
 	int ret;
+	u32 flags;
+	u32 ro_flags;
 
 	ret = check_inode_key(leaf, key, slot);
 	if (unlikely(ret < 0))
@@ -1063,11 +1066,17 @@ static int check_inode_item(struct extent_buffer *leaf,
 			btrfs_inode_nlink(leaf, iitem));
 		return -EUCLEAN;
 	}
-	if (unlikely(btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK)) {
+	btrfs_inode_split_flags(btrfs_inode_flags(leaf, iitem), &flags, &ro_flags);
+	if (unlikely(flags & ~BTRFS_INODE_FLAG_MASK)) {
 		inode_item_err(leaf, slot,
-			       "unknown flags detected: 0x%llx",
-			       btrfs_inode_flags(leaf, iitem) &
-			       ~BTRFS_INODE_FLAG_MASK);
+			       "unknown incompat flags detected: 0x%x", flags);
+		return -EUCLEAN;
+	}
+	if (unlikely(!sb_rdonly(fs_info->sb) &&
+		     (ro_flags & ~BTRFS_INODE_RO_FLAG_MASK))) {
+		inode_item_err(leaf, slot,
+			"unknown ro-compat flags detected on writeable mount: 0x%x",
+			ro_flags);
 		return -EUCLEAN;
 	}
 	return 0;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d09202e0c9df..567adc3de11a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3924,6 +3924,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 			    u64 logged_isize)
 {
 	struct btrfs_map_token token;
+	u64 flags;
 
 	btrfs_init_map_token(&token, leaf);
 
@@ -3973,7 +3974,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
 	btrfs_set_token_inode_transid(&token, item, trans->transid);
 	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
-	btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
+	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
+					  BTRFS_I(inode)->ro_flags);
+	btrfs_set_token_inode_flags(&token, item, flags);
 	btrfs_set_token_inode_block_group(&token, item, 0);
 }
 
-- 
cgit v1.2.3


From 146054090b0859b28fc39015c7704ccc3c3a347f Mon Sep 17 00:00:00 2001
From: Boris Burkov <boris@bur.io>
Date: Wed, 30 Jun 2021 13:01:49 -0700
Subject: btrfs: initial fsverity support

Add support for fsverity in btrfs. To support the generic interface in
fs/verity, we add two new item types in the fs tree for inodes with
verity enabled. One stores the per-file verity descriptor and btrfs
verity item and the other stores the Merkle tree data itself.

Verity checking is done in end_page_read just before a page is marked
uptodate. This naturally handles a variety of edge cases like holes,
preallocated extents, and inline extents. Some care needs to be taken to
not try to verity pages past the end of the file, which are accessed by
the generic buffered file reading code under some circumstances like
reading to the end of the last page and trying to read again. Direct IO
on a verity file falls back to buffered reads.

Verity relies on PageChecked for the Merkle tree data itself to avoid
re-walking up shared paths in the tree. For this reason, we need to
cache the Merkle tree data. Since the file is immutable after verity is
turned on, we can cache it at an index past EOF.

Use the new inode ro_flags to store verity on the inode item, so that we
can enable verity on a file, then rollback to an older kernel and still
mount the file system and read the file. Since we can't safely write the
file anymore without ruining the invariants of the Merkle tree, we mark
a ro_compat flag on the file system when a file has verity enabled.

Acked-by: Eric Biggers <ebiggers@google.com>
Co-developed-by: Chris Mason <clm@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: Boris Burkov <boris@bur.io>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/Makefile               |   1 +
 fs/btrfs/btrfs_inode.h          |   7 +
 fs/btrfs/ctree.h                |  31 +-
 fs/btrfs/extent_io.c            |  26 +-
 fs/btrfs/file.c                 |  10 +
 fs/btrfs/inode.c                |   6 +
 fs/btrfs/ioctl.c                |  14 +-
 fs/btrfs/super.c                |   3 +
 fs/btrfs/sysfs.c                |   6 +
 fs/btrfs/verity.c               | 738 ++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/btrfs.h      |   1 +
 include/uapi/linux/btrfs_tree.h |  35 ++
 12 files changed, 859 insertions(+), 19 deletions(-)
 create mode 100644 fs/btrfs/verity.c

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index cec88a66bd6c..3dcf9bcc2326 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -36,6 +36,7 @@ btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
 btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
 btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o
+btrfs-$(CONFIG_FS_VERITY) += verity.o
 
 btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
 	tests/extent-buffer-tests.o tests/btrfs-tests.o \
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 1093b00130be..76ee1452c57b 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -51,6 +51,13 @@ enum {
 	 * the file range, inode's io_tree).
 	 */
 	BTRFS_INODE_NO_DELALLOC_FLUSH,
+	/*
+	 * Set when we are working on enabling verity for a file. Computing and
+	 * writing the whole Merkle tree can take a while so we want to prevent
+	 * races where two separate tasks attempt to simultaneously start verity
+	 * on the same file.
+	 */
+	BTRFS_INODE_VERITY_IN_PROGRESS,
 };
 
 /* in memory btrfs inode */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9e3b7a56a78f..f17be4b023cb 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -281,7 +281,8 @@ struct btrfs_super_block {
 
 #define BTRFS_FEATURE_COMPAT_RO_SUPP			\
 	(BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE |	\
-	 BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID)
+	 BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \
+	 BTRFS_FEATURE_COMPAT_RO_VERITY)
 
 #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET	0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR	0ULL
@@ -1512,7 +1513,9 @@ do {                                                                   \
 	 BTRFS_INODE_COMPRESS |						\
 	 BTRFS_INODE_ROOT_ITEM_INIT)
 
-#define BTRFS_INODE_RO_FLAG_MASK					(0)
+#define BTRFS_INODE_RO_VERITY		(1U << 0)
+
+#define BTRFS_INODE_RO_FLAG_MASK	(BTRFS_INODE_RO_VERITY)
 
 struct btrfs_map_token {
 	struct extent_buffer *eb;
@@ -3791,6 +3794,30 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
 	return signal_pending(current);
 }
 
+/* verity.c */
+#ifdef CONFIG_FS_VERITY
+
+extern const struct fsverity_operations btrfs_verityops;
+int btrfs_drop_verity_items(struct btrfs_inode *inode);
+
+BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item,
+		   encryption, 8);
+BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item,
+		   size, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption,
+			 struct btrfs_verity_descriptor_item, encryption, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size,
+			 struct btrfs_verity_descriptor_item, size, 64);
+
+#else
+
+static inline int btrfs_drop_verity_items(struct btrfs_inode *inode)
+{
+	return 0;
+}
+
+#endif
+
 /* Sanity test specific functions */
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 void btrfs_test_destroy_inode(struct inode *inode);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 66888b10b00d..96de6e70d06c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -13,6 +13,7 @@
 #include <linux/pagevec.h>
 #include <linux/prefetch.h>
 #include <linux/cleancache.h>
+#include <linux/fsverity.h>
 #include "misc.h"
 #include "extent_io.h"
 #include "extent-io-tree.h"
@@ -2247,18 +2248,6 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	return bitset;
 }
 
-/*
- * helper function to set a given page up to date if all the
- * extents in the tree for that page are up to date
- */
-static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
-{
-	u64 start = page_offset(page);
-	u64 end = start + PAGE_SIZE - 1;
-	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
-		SetPageUptodate(page);
-}
-
 int free_io_failure(struct extent_io_tree *failure_tree,
 		    struct extent_io_tree *io_tree,
 		    struct io_failure_record *rec)
@@ -2690,7 +2679,15 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
 	       start + len <= page_offset(page) + PAGE_SIZE);
 
 	if (uptodate) {
-		btrfs_page_set_uptodate(fs_info, page, start, len);
+		if (fsverity_active(page->mapping->host) &&
+		    !PageError(page) &&
+		    !PageUptodate(page) &&
+		    start < i_size_read(page->mapping->host) &&
+		    !fsverity_verify_page(page)) {
+			btrfs_page_set_error(fs_info, page, start, len);
+		} else {
+			btrfs_page_set_uptodate(fs_info, page, start, len);
+		}
 	} else {
 		btrfs_page_clear_uptodate(fs_info, page, start, len);
 		btrfs_page_set_error(fs_info, page, start, len);
@@ -3105,7 +3102,7 @@ readpage_ok:
 		/* Update page status and unlock */
 		end_page_read(page, uptodate, start, len);
 		endio_readpage_release_extent(&processed, BTRFS_I(inode),
-					      start, end, uptodate);
+					      start, end, PageUptodate(page));
 	}
 	/* Release the last extent */
 	endio_readpage_release_extent(&processed, NULL, 0, 0, false);
@@ -3706,7 +3703,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 		/* the get_extent function already copied into the page */
 		if (test_range_bit(tree, cur, cur_end,
 				   EXTENT_UPTODATE, 1, NULL)) {
-			check_page_uptodate(tree, page);
 			unlock_extent(tree, cur, cur + iosize - 1);
 			end_page_read(page, true, cur, iosize);
 			cur = cur + iosize;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d3f2623a2af0..7ff577005d0f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -16,6 +16,7 @@
 #include <linux/btrfs.h>
 #include <linux/uio.h>
 #include <linux/iversion.h>
+#include <linux/fsverity.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -3615,7 +3616,13 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
 
 static int btrfs_file_open(struct inode *inode, struct file *filp)
 {
+	int ret;
+
 	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
+
+	ret = fsverity_file_open(inode, filp);
+	if (ret)
+		return ret;
 	return generic_file_open(inode, filp);
 }
 
@@ -3644,6 +3651,9 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
 	struct inode *inode = file_inode(iocb->ki_filp);
 	ssize_t ret;
 
+	if (fsverity_active(inode))
+		return 0;
+
 	if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
 		return 0;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index cd5a67ba7e71..766cd35be33d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -32,6 +32,7 @@
 #include <linux/sched/mm.h>
 #include <linux/iomap.h>
 #include <asm/unaligned.h>
+#include <linux/fsverity.h>
 #include "misc.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -5560,6 +5561,7 @@ void btrfs_evict_inode(struct inode *inode)
 	trace_btrfs_inode_evict(inode);
 
 	if (!root) {
+		fsverity_cleanup_inode(inode);
 		clear_inode(inode);
 		return;
 	}
@@ -5642,6 +5644,7 @@ no_delete:
 	 * to retry these periodically in the future.
 	 */
 	btrfs_remove_delayed_node(BTRFS_I(inode));
+	fsverity_cleanup_inode(inode);
 	clear_inode(inode);
 }
 
@@ -9250,6 +9253,7 @@ static int btrfs_getattr(struct user_namespace *mnt_userns,
 	struct inode *inode = d_inode(path->dentry);
 	u32 blocksize = inode->i_sb->s_blocksize;
 	u32 bi_flags = BTRFS_I(inode)->flags;
+	u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
 
 	stat->result_mask |= STATX_BTIME;
 	stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
@@ -9262,6 +9266,8 @@ static int btrfs_getattr(struct user_namespace *mnt_userns,
 		stat->attributes |= STATX_ATTR_IMMUTABLE;
 	if (bi_flags & BTRFS_INODE_NODUMP)
 		stat->attributes |= STATX_ATTR_NODUMP;
+	if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
+		stat->attributes |= STATX_ATTR_VERITY;
 
 	stat->attributes_mask |= (STATX_ATTR_APPEND |
 				  STATX_ATTR_COMPRESSED |
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 17aefb5f08ea..85c8b5a87a6a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -27,6 +27,7 @@
 #include <linux/uaccess.h>
 #include <linux/iversion.h>
 #include <linux/fileattr.h>
+#include <linux/fsverity.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "export.h"
@@ -107,6 +108,7 @@ static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode)
 {
 	unsigned int iflags = 0;
 	u32 flags = binode->flags;
+	u32 ro_flags = binode->ro_flags;
 
 	if (flags & BTRFS_INODE_SYNC)
 		iflags |= FS_SYNC_FL;
@@ -122,6 +124,8 @@ static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode)
 		iflags |= FS_DIRSYNC_FL;
 	if (flags & BTRFS_INODE_NODATACOW)
 		iflags |= FS_NOCOW_FL;
+	if (ro_flags & BTRFS_INODE_RO_VERITY)
+		iflags |= FS_VERITY_FL;
 
 	if (flags & BTRFS_INODE_NOCOMPRESS)
 		iflags |= FS_NOCOMP_FL;
@@ -149,10 +153,12 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode)
 		new_fl |= S_NOATIME;
 	if (binode->flags & BTRFS_INODE_DIRSYNC)
 		new_fl |= S_DIRSYNC;
+	if (binode->ro_flags & BTRFS_INODE_RO_VERITY)
+		new_fl |= S_VERITY;
 
 	set_mask_bits(&inode->i_flags,
-		      S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC,
-		      new_fl);
+		      S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC |
+		      S_VERITY, new_fl);
 }
 
 /*
@@ -5020,6 +5026,10 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_get_subvol_rootref(file, argp);
 	case BTRFS_IOC_INO_LOOKUP_USER:
 		return btrfs_ioctl_ino_lookup_user(file, argp);
+	case FS_IOC_ENABLE_VERITY:
+		return fsverity_ioctl_enable(file, (const void __user *)argp);
+	case FS_IOC_MEASURE_VERITY:
+		return fsverity_ioctl_measure(file, argp);
 	}
 
 	return -ENOTTY;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0d2e3ab2fc31..2bdc544b4c95 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1353,6 +1353,9 @@ static int btrfs_fill_super(struct super_block *sb,
 	sb->s_op = &btrfs_super_ops;
 	sb->s_d_op = &btrfs_dentry_operations;
 	sb->s_export_op = &btrfs_export_ops;
+#ifdef CONFIG_FS_VERITY
+	sb->s_vop = &btrfs_verityops;
+#endif
 	sb->s_xattr = btrfs_xattr_handlers;
 	sb->s_time_gran = 1;
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index d9d53a255ef9..bfe5e27617b0 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -267,6 +267,9 @@ BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
 #ifdef CONFIG_BTRFS_DEBUG
 BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
 #endif
+#ifdef CONFIG_FS_VERITY
+BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY);
+#endif
 
 static struct attribute *btrfs_supported_feature_attrs[] = {
 	BTRFS_FEAT_ATTR_PTR(mixed_backref),
@@ -284,6 +287,9 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
 	BTRFS_FEAT_ATTR_PTR(raid1c34),
 #ifdef CONFIG_BTRFS_DEBUG
 	BTRFS_FEAT_ATTR_PTR(zoned),
+#endif
+#ifdef CONFIG_FS_VERITY
+	BTRFS_FEAT_ATTR_PTR(verity),
 #endif
 	NULL
 };
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
new file mode 100644
index 000000000000..ac4c2ca45925
--- /dev/null
+++ b/fs/btrfs/verity.c
@@ -0,0 +1,738 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/iversion.h>
+#include <linux/fsverity.h>
+#include <linux/sched/mm.h>
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "locking.h"
+
+/*
+ * Implementation of the interface defined in struct fsverity_operations.
+ *
+ * The main question is how and where to store the verity descriptor and the
+ * Merkle tree. We store both in dedicated btree items in the filesystem tree,
+ * together with the rest of the inode metadata. This means we'll need to do
+ * extra work to encrypt them once encryption is supported in btrfs, but btrfs
+ * has a lot of careful code around i_size and it seems better to make a new key
+ * type than try and adjust all of our expectations for i_size.
+ *
+ * Note that this differs from the implementation in ext4 and f2fs, where
+ * this data is stored as if it were in the file, but past EOF. However, btrfs
+ * does not have a widespread mechanism for caching opaque metadata pages, so we
+ * do pretend that the Merkle tree pages themselves are past EOF for the
+ * purposes of caching them (as opposed to creating a virtual inode).
+ *
+ * fs verity items are stored under two different key types on disk.
+ * The descriptor items:
+ * [ inode objectid, BTRFS_VERITY_DESC_ITEM_KEY, offset ]
+ *
+ * At offset 0, we store a btrfs_verity_descriptor_item which tracks the
+ * size of the descriptor item and some extra data for encryption.
+ * Starting at offset 1, these hold the generic fs verity descriptor.
+ * The latter are opaque to btrfs, we just read and write them as a blob for
+ * the higher level verity code.  The most common descriptor size is 256 bytes.
+ *
+ * The merkle tree items:
+ * [ inode objectid, BTRFS_VERITY_MERKLE_ITEM_KEY, offset ]
+ *
+ * These also start at offset 0, and correspond to the merkle tree bytes.
+ * So when fsverity asks for page 0 of the merkle tree, we pull up one page
+ * starting at offset 0 for this key type.  These are also opaque to btrfs,
+ * we're blindly storing whatever fsverity sends down.
+ */
+
+#define MERKLE_START_ALIGN			65536
+
+/*
+ * Compute the logical file offset where we cache the Merkle tree.
+ *
+ * @inode:  inode of the verity file
+ *
+ * For the purposes of caching the Merkle tree pages, as required by
+ * fs-verity, it is convenient to do size computations in terms of a file
+ * offset, rather than in terms of page indices.
+ *
+ * Use 64K to be sure it's past the last page in the file, even with 64K pages.
+ * That rounding operation itself can overflow loff_t, so we do it in u64 and
+ * check.
+ *
+ * Returns the file offset on success, negative error code on failure.
+ */
+static loff_t merkle_file_pos(const struct inode *inode)
+{
+	u64 sz = inode->i_size;
+	u64 rounded = round_up(sz, MERKLE_START_ALIGN);
+
+	if (rounded > inode->i_sb->s_maxbytes)
+		return -EFBIG;
+
+	return rounded;
+}
+
+/*
+ * Drop all the items for this inode with this key_type.
+ *
+ * @inode:     inode to drop items for
+ * @key_type:  type of items to drop (BTRFS_VERITY_DESC_ITEM or
+ *             BTRFS_VERITY_MERKLE_ITEM)
+ *
+ * Before doing a verity enable we cleanup any existing verity items.
+ * This is also used to clean up if a verity enable failed half way through.
+ *
+ * Returns number of dropped items on success, negative error code on failure.
+ */
+static int drop_verity_items(struct btrfs_inode *inode, u8 key_type)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = inode->root;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	int count = 0;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	while (1) {
+		/* 1 for the item being dropped */
+		trans = btrfs_start_transaction(root, 1);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			goto out;
+		}
+
+		/*
+		 * Walk backwards through all the items until we find one that
+		 * isn't from our key type or objectid
+		 */
+		key.objectid = btrfs_ino(inode);
+		key.type = key_type;
+		key.offset = (u64)-1;
+
+		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+		if (ret > 0) {
+			ret = 0;
+			/* No more keys of this type, we're done */
+			if (path->slots[0] == 0)
+				break;
+			path->slots[0]--;
+		} else if (ret < 0) {
+			btrfs_end_transaction(trans);
+			goto out;
+		}
+
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+		/* No more keys of this type, we're done */
+		if (key.objectid != btrfs_ino(inode) || key.type != key_type)
+			break;
+
+		/*
+		 * This shouldn't be a performance sensitive function because
+		 * it's not used as part of truncate.  If it ever becomes
+		 * perf sensitive, change this to walk forward and bulk delete
+		 * items
+		 */
+		ret = btrfs_del_items(trans, root, path, path->slots[0], 1);
+		if (ret) {
+			btrfs_end_transaction(trans);
+			goto out;
+		}
+		count++;
+		btrfs_release_path(path);
+		btrfs_end_transaction(trans);
+	}
+	ret = count;
+	btrfs_end_transaction(trans);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Drop all verity items
+ *
+ * @inode:  inode to drop verity items for
+ *
+ * In most contexts where we are dropping verity items, we want to do it for all
+ * the types of verity items, not a particular one.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+int btrfs_drop_verity_items(struct btrfs_inode *inode)
+{
+	int ret;
+
+	ret = drop_verity_items(inode, BTRFS_VERITY_DESC_ITEM_KEY);
+	if (ret < 0)
+		return ret;
+	ret = drop_verity_items(inode, BTRFS_VERITY_MERKLE_ITEM_KEY);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+/*
+ * Insert and write inode items with a given key type and offset.
+ *
+ * @inode:     inode to insert for
+ * @key_type:  key type to insert
+ * @offset:    item offset to insert at
+ * @src:       source data to write
+ * @len:       length of source data to write
+ *
+ * Write len bytes from src into items of up to 2K length.
+ * The inserted items will have key (ino, key_type, offset + off) where off is
+ * consecutively increasing from 0 up to the last item ending at offset + len.
+ *
+ * Returns 0 on success and a negative error code on failure.
+ */
+static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
+			   const char *src, u64 len)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_path *path;
+	struct btrfs_root *root = inode->root;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	unsigned long copy_bytes;
+	unsigned long src_offset = 0;
+	void *data;
+	int ret = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	while (len > 0) {
+		/* 1 for the new item being inserted */
+		trans = btrfs_start_transaction(root, 1);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			break;
+		}
+
+		key.objectid = btrfs_ino(inode);
+		key.type = key_type;
+		key.offset = offset;
+
+		/*
+		 * Insert 2K at a time mostly to be friendly for smaller leaf
+		 * size filesystems
+		 */
+		copy_bytes = min_t(u64, len, 2048);
+
+		ret = btrfs_insert_empty_item(trans, root, path, &key, copy_bytes);
+		if (ret) {
+			btrfs_end_transaction(trans);
+			break;
+		}
+
+		leaf = path->nodes[0];
+
+		data = btrfs_item_ptr(leaf, path->slots[0], void);
+		write_extent_buffer(leaf, src + src_offset,
+				    (unsigned long)data, copy_bytes);
+		offset += copy_bytes;
+		src_offset += copy_bytes;
+		len -= copy_bytes;
+
+		btrfs_release_path(path);
+		btrfs_end_transaction(trans);
+	}
+
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Read inode items of the given key type and offset from the btree.
+ *
+ * @inode:      inode to read items of
+ * @key_type:   key type to read
+ * @offset:     item offset to read from
+ * @dest:       Buffer to read into. This parameter has slightly tricky
+ *              semantics.  If it is NULL, the function will not do any copying
+ *              and will just return the size of all the items up to len bytes.
+ *              If dest_page is passed, then the function will kmap_local the
+ *              page and ignore dest, but it must still be non-NULL to avoid the
+ *              counting-only behavior.
+ * @len:        length in bytes to read
+ * @dest_page:  copy into this page instead of the dest buffer
+ *
+ * Helper function to read items from the btree.  This returns the number of
+ * bytes read or < 0 for errors.  We can return short reads if the items don't
+ * exist on disk or aren't big enough to fill the desired length.  Supports
+ * reading into a provided buffer (dest) or into the page cache
+ *
+ * Returns number of bytes read or a negative error code on failure.
+ */
+static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
+			  char *dest, u64 len, struct page *dest_page)
+{
+	struct btrfs_path *path;
+	struct btrfs_root *root = inode->root;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	u64 item_end;
+	u64 copy_end;
+	int copied = 0;
+	u32 copy_offset;
+	unsigned long copy_bytes;
+	unsigned long dest_offset = 0;
+	void *data;
+	char *kaddr = dest;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	if (dest_page)
+		path->reada = READA_FORWARD;
+
+	key.objectid = btrfs_ino(inode);
+	key.type = key_type;
+	key.offset = offset;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0) {
+		goto out;
+	} else if (ret > 0) {
+		ret = 0;
+		if (path->slots[0] == 0)
+			goto out;
+		path->slots[0]--;
+	}
+
+	while (len > 0) {
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+		if (key.objectid != btrfs_ino(inode) || key.type != key_type)
+			break;
+
+		item_end = btrfs_item_size_nr(leaf, path->slots[0]) + key.offset;
+
+		if (copied > 0) {
+			/*
+			 * Once we've copied something, we want all of the items
+			 * to be sequential
+			 */
+			if (key.offset != offset)
+				break;
+		} else {
+			/*
+			 * Our initial offset might be in the middle of an
+			 * item.  Make sure it all makes sense.
+			 */
+			if (key.offset > offset)
+				break;
+			if (item_end <= offset)
+				break;
+		}
+
+		/* desc = NULL to just sum all the item lengths */
+		if (!dest)
+			copy_end = item_end;
+		else
+			copy_end = min(offset + len, item_end);
+
+		/* Number of bytes in this item we want to copy */
+		copy_bytes = copy_end - offset;
+
+		/* Offset from the start of item for copying */
+		copy_offset = offset - key.offset;
+
+		if (dest) {
+			if (dest_page)
+				kaddr = kmap_local_page(dest_page);
+
+			data = btrfs_item_ptr(leaf, path->slots[0], void);
+			read_extent_buffer(leaf, kaddr + dest_offset,
+					   (unsigned long)data + copy_offset,
+					   copy_bytes);
+
+			if (dest_page)
+				kunmap_local(kaddr);
+		}
+
+		offset += copy_bytes;
+		dest_offset += copy_bytes;
+		len -= copy_bytes;
+		copied += copy_bytes;
+
+		path->slots[0]++;
+		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+			/*
+			 * We've reached the last slot in this leaf and we need
+			 * to go to the next leaf.
+			 */
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0) {
+				break;
+			} else if (ret > 0) {
+				ret = 0;
+				break;
+			}
+		}
+	}
+out:
+	btrfs_free_path(path);
+	if (!ret)
+		ret = copied;
+	return ret;
+}
+
+/*
+ * Rollback in-progress verity if we encounter an error.
+ *
+ * @inode:  inode verity had an error for
+ *
+ * We try to handle recoverable errors while enabling verity by rolling it back
+ * and just failing the operation, rather than having an fs level error no
+ * matter what. However, any error in rollback is unrecoverable.
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int rollback_verity(struct btrfs_inode *inode)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = inode->root;
+	int ret;
+
+	ASSERT(inode_is_locked(&inode->vfs_inode));
+	truncate_inode_pages(inode->vfs_inode.i_mapping, inode->vfs_inode.i_size);
+	clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
+	ret = btrfs_drop_verity_items(inode);
+	if (ret) {
+		btrfs_handle_fs_error(root->fs_info, ret,
+				"failed to drop verity items in rollback %llu",
+				(u64)inode->vfs_inode.i_ino);
+		goto out;
+	}
+
+	/* 1 for updating the inode flag */
+	trans = btrfs_start_transaction(root, 1);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		btrfs_handle_fs_error(root->fs_info, ret,
+			"failed to start transaction in verity rollback %llu",
+			(u64)inode->vfs_inode.i_ino);
+		goto out;
+	}
+	inode->ro_flags &= ~BTRFS_INODE_RO_VERITY;
+	btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+	ret = btrfs_update_inode(trans, root, inode);
+	if (ret) {
+		btrfs_abort_transaction(trans, ret);
+		goto out;
+	}
+	btrfs_end_transaction(trans);
+out:
+	return ret;
+}
+
+/*
+ * Finalize making the file a valid verity file
+ *
+ * @inode:      inode to be marked as verity
+ * @desc:       contents of the verity descriptor to write (not NULL)
+ * @desc_size:  size of the verity descriptor
+ *
+ * Do the actual work of finalizing verity after successfully writing the Merkle
+ * tree:
+ *
+ * - write out the descriptor items
+ * - mark the inode with the verity flag
+ * - mark the ro compat bit
+ * - clear the in progress bit
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int finish_verity(struct btrfs_inode *inode, const void *desc,
+			 size_t desc_size)
+{
+	struct btrfs_trans_handle *trans = NULL;
+	struct btrfs_root *root = inode->root;
+	struct btrfs_verity_descriptor_item item;
+	int ret;
+
+	/* Write out the descriptor item */
+	memset(&item, 0, sizeof(item));
+	btrfs_set_stack_verity_descriptor_size(&item, desc_size);
+	ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 0,
+			      (const char *)&item, sizeof(item));
+	if (ret)
+		goto out;
+
+	/* Write out the descriptor itself */
+	ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 1,
+			      desc, desc_size);
+	if (ret)
+		goto out;
+
+	/* 1 for updating the inode flag */
+	trans = btrfs_start_transaction(root, 1);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+	inode->ro_flags |= BTRFS_INODE_RO_VERITY;
+	btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+	ret = btrfs_update_inode(trans, root, inode);
+	if (ret)
+		goto end_trans;
+	clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
+	btrfs_set_fs_compat_ro(root->fs_info, VERITY);
+end_trans:
+	btrfs_end_transaction(trans);
+out:
+	return ret;
+
+}
+
+/*
+ * fsverity op that begins enabling verity.
+ *
+ * @filp:  file to enable verity on
+ *
+ * Begin enabling fsverity for the file. We drop any existing verity items
+ * and set the in progress bit.
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int btrfs_begin_enable_verity(struct file *filp)
+{
+	struct btrfs_inode *inode = BTRFS_I(file_inode(filp));
+	int ret;
+
+	ASSERT(inode_is_locked(file_inode(filp)));
+
+	if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags))
+		return -EBUSY;
+
+	ret = btrfs_drop_verity_items(inode);
+	if (ret)
+		return ret;
+
+	set_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
+
+	return 0;
+}
+
+/*
+ * fsverity op that ends enabling verity.
+ *
+ * @filp:              file we are finishing enabling verity on
+ * @desc:              verity descriptor to write out (NULL in error conditions)
+ * @desc_size:         size of the verity descriptor (variable with signatures)
+ * @merkle_tree_size:  size of the merkle tree in bytes
+ *
+ * If desc is null, then VFS is signaling an error occurred during verity
+ * enable, and we should try to rollback. Otherwise, attempt to finish verity.
+ *
+ * Returns 0 on success, negative error code on error.
+ */
+static int btrfs_end_enable_verity(struct file *filp, const void *desc,
+				   size_t desc_size, u64 merkle_tree_size)
+{
+	struct btrfs_inode *inode = BTRFS_I(file_inode(filp));
+	int ret = 0;
+	int rollback_ret;
+
+	ASSERT(inode_is_locked(file_inode(filp)));
+
+	if (desc == NULL)
+		goto rollback;
+
+	ret = finish_verity(inode, desc, desc_size);
+	if (ret)
+		goto rollback;
+	return ret;
+
+rollback:
+	rollback_ret = rollback_verity(inode);
+	if (rollback_ret)
+		btrfs_err(inode->root->fs_info,
+			  "failed to rollback verity items: %d", rollback_ret);
+	return ret;
+}
+
+/*
+ * fsverity op that gets the struct fsverity_descriptor.
+ *
+ * @inode:     inode to get the descriptor of
+ * @buf:       output buffer for the descriptor contents
+ * @buf_size:  size of the output buffer. 0 to query the size
+ *
+ * fsverity does a two pass setup for reading the descriptor, in the first pass
+ * it calls with buf_size = 0 to query the size of the descriptor, and then in
+ * the second pass it actually reads the descriptor off disk.
+ *
+ * Returns the size on success or a negative error code on failure.
+ */
+static int btrfs_get_verity_descriptor(struct inode *inode, void *buf,
+				       size_t buf_size)
+{
+	u64 true_size;
+	int ret = 0;
+	struct btrfs_verity_descriptor_item item;
+
+	memset(&item, 0, sizeof(item));
+	ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_DESC_ITEM_KEY, 0,
+			     (char *)&item, sizeof(item), NULL);
+	if (ret < 0)
+		return ret;
+
+	if (item.reserved[0] != 0 || item.reserved[1] != 0)
+		return -EUCLEAN;
+
+	true_size = btrfs_stack_verity_descriptor_size(&item);
+	if (true_size > INT_MAX)
+		return -EUCLEAN;
+
+	if (buf_size == 0)
+		return true_size;
+	if (buf_size < true_size)
+		return -ERANGE;
+
+	ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_DESC_ITEM_KEY, 1,
+			     buf, buf_size, NULL);
+	if (ret < 0)
+		return ret;
+	if (ret != true_size)
+		return -EIO;
+
+	return true_size;
+}
+
+/*
+ * fsverity op that reads and caches a merkle tree page.
+ *
+ * @inode:         inode to read a merkle tree page for
+ * @index:         page index relative to the start of the merkle tree
+ * @num_ra_pages:  number of pages to readahead. Optional, we ignore it
+ *
+ * The Merkle tree is stored in the filesystem btree, but its pages are cached
+ * with a logical position past EOF in the inode's mapping.
+ *
+ * Returns the page we read, or an ERR_PTR on error.
+ */
+static struct page *btrfs_read_merkle_tree_page(struct inode *inode,
+						pgoff_t index,
+						unsigned long num_ra_pages)
+{
+	struct page *page;
+	u64 off = (u64)index << PAGE_SHIFT;
+	loff_t merkle_pos = merkle_file_pos(inode);
+	int ret;
+
+	if (merkle_pos < 0)
+		return ERR_PTR(merkle_pos);
+	if (merkle_pos > inode->i_sb->s_maxbytes - off - PAGE_SIZE)
+		return ERR_PTR(-EFBIG);
+	index += merkle_pos >> PAGE_SHIFT;
+again:
+	page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
+	if (page) {
+		if (PageUptodate(page))
+			return page;
+
+		lock_page(page);
+		/*
+		 * We only insert uptodate pages, so !Uptodate has to be
+		 * an error
+		 */
+		if (!PageUptodate(page)) {
+			unlock_page(page);
+			put_page(page);
+			return ERR_PTR(-EIO);
+		}
+		unlock_page(page);
+		return page;
+	}
+
+	page = __page_cache_alloc(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS));
+	if (!page)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * Merkle item keys are indexed from byte 0 in the merkle tree.
+	 * They have the form:
+	 *
+	 * [ inode objectid, BTRFS_MERKLE_ITEM_KEY, offset in bytes ]
+	 */
+	ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off,
+			     page_address(page), PAGE_SIZE, page);
+	if (ret < 0) {
+		put_page(page);
+		return ERR_PTR(ret);
+	}
+	if (ret < PAGE_SIZE)
+		memzero_page(page, ret, PAGE_SIZE - ret);
+
+	SetPageUptodate(page);
+	ret = add_to_page_cache_lru(page, inode->i_mapping, index, GFP_NOFS);
+
+	if (!ret) {
+		/* Inserted and ready for fsverity */
+		unlock_page(page);
+	} else {
+		put_page(page);
+		/* Did someone race us into inserting this page? */
+		if (ret == -EEXIST)
+			goto again;
+		page = ERR_PTR(ret);
+	}
+	return page;
+}
+
+/*
+ * fsverity op that writes a Merkle tree block into the btree.
+ *
+ * @inode:          inode to write a Merkle tree block for
+ * @buf:            Merkle tree data block to write
+ * @index:          index of the block in the Merkle tree
+ * @log_blocksize:  log base 2 of the Merkle tree block size
+ *
+ * Note that the block size could be different from the page size, so it is not
+ * safe to assume that index is a page index.
+ *
+ * Returns 0 on success or negative error code on failure
+ */
+static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf,
+					u64 index, int log_blocksize)
+{
+	u64 off = index << log_blocksize;
+	u64 len = 1ULL << log_blocksize;
+	loff_t merkle_pos = merkle_file_pos(inode);
+
+	if (merkle_pos < 0)
+		return merkle_pos;
+	if (merkle_pos > inode->i_sb->s_maxbytes - off - len)
+		return -EFBIG;
+
+	return write_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY,
+			       off, buf, len);
+}
+
+const struct fsverity_operations btrfs_verityops = {
+	.begin_enable_verity     = btrfs_begin_enable_verity,
+	.end_enable_verity       = btrfs_end_enable_verity,
+	.get_verity_descriptor   = btrfs_get_verity_descriptor,
+	.read_merkle_tree_page   = btrfs_read_merkle_tree_page,
+	.write_merkle_tree_block = btrfs_write_merkle_tree_block,
+};
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index 22cd037123fa..d7d3cfead056 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -288,6 +288,7 @@ struct btrfs_ioctl_fs_info_args {
  * first mount when booting older kernel versions.
  */
 #define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID	(1ULL << 1)
+#define BTRFS_FEATURE_COMPAT_RO_VERITY			(1ULL << 2)
 
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
 #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(1ULL << 1)
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index ccdb40fe40dc..e1c4c732aaba 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -118,6 +118,29 @@
 #define BTRFS_INODE_REF_KEY		12
 #define BTRFS_INODE_EXTREF_KEY		13
 #define BTRFS_XATTR_ITEM_KEY		24
+
+/*
+ * fs verity items are stored under two different key types on disk.
+ * The descriptor items:
+ * [ inode objectid, BTRFS_VERITY_DESC_ITEM_KEY, offset ]
+ *
+ * At offset 0, we store a btrfs_verity_descriptor_item which tracks the size
+ * of the descriptor item and some extra data for encryption.
+ * Starting at offset 1, these hold the generic fs verity descriptor.  The
+ * latter are opaque to btrfs, we just read and write them as a blob for the
+ * higher level verity code.  The most common descriptor size is 256 bytes.
+ *
+ * The merkle tree items:
+ * [ inode objectid, BTRFS_VERITY_MERKLE_ITEM_KEY, offset ]
+ *
+ * These also start at offset 0, and correspond to the merkle tree bytes.  When
+ * fsverity asks for page 0 of the merkle tree, we pull up one page starting at
+ * offset 0 for this key type.  These are also opaque to btrfs, we're blindly
+ * storing whatever fsverity sends down.
+ */
+#define BTRFS_VERITY_DESC_ITEM_KEY	36
+#define BTRFS_VERITY_MERKLE_ITEM_KEY	37
+
 #define BTRFS_ORPHAN_ITEM_KEY		48
 /* reserve 2-15 close to the inode for later flexibility */
 
@@ -991,4 +1014,16 @@ struct btrfs_qgroup_limit_item {
 	__le64 rsv_excl;
 } __attribute__ ((__packed__));
 
+struct btrfs_verity_descriptor_item {
+	/* Size of the verity descriptor in bytes */
+	__le64 size;
+	/*
+	 * When we implement support for fscrypt, we will need to encrypt the
+	 * Merkle tree for encrypted verity files. These 128 bits are for the
+	 * eventual storage of an fscrypt initialization vector.
+	 */
+	__le64 reserved[2];
+	__u8 encryption;
+} __attribute__ ((__packed__));
+
 #endif /* _BTRFS_CTREE_H_ */
-- 
cgit v1.2.3


From 705242538ff348874e642f2ce953e19702af411d Mon Sep 17 00:00:00 2001
From: Boris Burkov <boris@bur.io>
Date: Wed, 30 Jun 2021 13:01:50 -0700
Subject: btrfs: verity metadata orphan items

Writing out the verity data is too large of an operation to do in a
single transaction. If we are interrupted before we finish creating
fsverity metadata for a file, or fail to clean up already created
metadata after a failure, we could leak the verity items that we already
committed.

To address this issue, we use the orphan mechanism. When we start
enabling verity on a file, we also add an orphan item for that inode.
When we are finished, we delete the orphan. However, if we are
interrupted midway, the orphan will be present at mount and we can
cleanup the half-formed verity state.

There is a possible race with a normal unlink operation: if unlink and
verity run on the same file in parallel, it is possible for verity to
succeed and delete the still legitimate orphan added by unlink. Then, if
we are interrupted and mount in that state, we will never clean up the
inode properly. This is also possible for a file created with O_TMPFILE.
Check nlink==0 before deleting to avoid this race.

A final thing to note is that this is a resurrection of using orphans to
signal an operation besides "delete this inode". The old case was to
signal the need to do a truncate. That case still technically applies
for mounting very old file systems, so we need to take some care to not
clobber it. To that end, we just have to be careful that verity orphan
cleanup is a no-op for non-verity files.

Signed-off-by: Boris Burkov <boris@bur.io>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c  | 15 ++++++++--
 fs/btrfs/verity.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 93 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 766cd35be33d..bc41d6c8d8d8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3557,7 +3557,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 
 		/*
 		 * If we have an inode with links, there are a couple of
-		 * possibilities. Old kernels (before v3.12) used to create an
+		 * possibilities:
+		 *
+		 * 1. We were halfway through creating fsverity metadata for the
+		 * file. In that case, the orphan item represents incomplete
+		 * fsverity metadata which must be cleaned up with
+		 * btrfs_drop_verity_items and deleting the orphan item.
+
+		 * 2. Old kernels (before v3.12) used to create an
 		 * orphan item for truncate indicating that there were possibly
 		 * extent items past i_size that needed to be deleted. In v3.12,
 		 * truncate was changed to update i_size in sync with the extent
@@ -3575,8 +3582,12 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 		 * but either way, we can delete the orphan item.
 		 */
 		if (ret == -ENOENT || inode->i_nlink) {
-			if (!ret)
+			if (!ret) {
+				ret = btrfs_drop_verity_items(BTRFS_I(inode));
 				iput(inode);
+				if (ret)
+					goto out;
+			}
 			trans = btrfs_start_transaction(root, 1);
 			if (IS_ERR(trans)) {
 				ret = PTR_ERR(trans);
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index ac4c2ca45925..28d443d3ef93 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -49,6 +49,15 @@
  * So when fsverity asks for page 0 of the merkle tree, we pull up one page
  * starting at offset 0 for this key type.  These are also opaque to btrfs,
  * we're blindly storing whatever fsverity sends down.
+ *
+ * Another important consideration is the fact that the Merkle tree data scales
+ * linearly with the size of the file (with 4K pages/blocks and SHA-256, it's
+ * ~1/127th the size) so for large files, writing the tree can be a lengthy
+ * operation. For that reason, we guard the whole enable verity operation
+ * (between begin_enable_verity and end_enable_verity) with an orphan item.
+ * Again, because the data can be pretty large, it's quite possible that we
+ * could run out of space writing it, so we try our best to handle errors by
+ * stopping and rolling back rather than aborting the victim transaction.
  */
 
 #define MERKLE_START_ALIGN			65536
@@ -396,6 +405,39 @@ out:
 	return ret;
 }
 
+/*
+ * Delete an fsverity orphan
+ *
+ * @trans:  transaction to do the delete in
+ * @inode:  inode to orphan
+ *
+ * Capture verity orphan specific logic that is repeated in the couple places
+ * we delete verity orphans. Specifically, handling ENOENT and ignoring inodes
+ * with 0 links.
+ *
+ * Returns zero on success or a negative error code on failure.
+ */
+static int del_orphan(struct btrfs_trans_handle *trans, struct btrfs_inode *inode)
+{
+	struct btrfs_root *root = inode->root;
+	int ret;
+
+	/*
+	 * If the inode has no links, it is either already unlinked, or was
+	 * created with O_TMPFILE. In either case, it should have an orphan from
+	 * that other operation. Rather than reference count the orphans, we
+	 * simply ignore them here, because we only invoke the verity path in
+	 * the orphan logic when i_nlink is 1.
+	 */
+	if (!inode->vfs_inode.i_nlink)
+		return 0;
+
+	ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
+	if (ret == -ENOENT)
+		ret = 0;
+	return ret;
+}
+
 /*
  * Rollback in-progress verity if we encounter an error.
  *
@@ -424,8 +466,11 @@ static int rollback_verity(struct btrfs_inode *inode)
 		goto out;
 	}
 
-	/* 1 for updating the inode flag */
-	trans = btrfs_start_transaction(root, 1);
+	/*
+	 * 1 for updating the inode flag
+	 * 1 for deleting the orphan
+	 */
+	trans = btrfs_start_transaction(root, 2);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
 		btrfs_handle_fs_error(root->fs_info, ret,
@@ -440,6 +485,11 @@ static int rollback_verity(struct btrfs_inode *inode)
 		btrfs_abort_transaction(trans, ret);
 		goto out;
 	}
+	ret = del_orphan(trans, inode);
+	if (ret) {
+		btrfs_abort_transaction(trans, ret);
+		goto out;
+	}
 	btrfs_end_transaction(trans);
 out:
 	return ret;
@@ -457,6 +507,7 @@ out:
  *
  * - write out the descriptor items
  * - mark the inode with the verity flag
+ * - delete the orphan item
  * - mark the ro compat bit
  * - clear the in progress bit
  *
@@ -484,8 +535,11 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc,
 	if (ret)
 		goto out;
 
-	/* 1 for updating the inode flag */
-	trans = btrfs_start_transaction(root, 1);
+	/*
+	 * 1 for updating the inode flag
+	 * 1 for deleting the orphan
+	 */
+	trans = btrfs_start_transaction(root, 2);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
 		goto out;
@@ -493,6 +547,9 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc,
 	inode->ro_flags |= BTRFS_INODE_RO_VERITY;
 	btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
 	ret = btrfs_update_inode(trans, root, inode);
+	if (ret)
+		goto end_trans;
+	ret = del_orphan(trans, inode);
 	if (ret)
 		goto end_trans;
 	clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
@@ -509,14 +566,16 @@ out:
  *
  * @filp:  file to enable verity on
  *
- * Begin enabling fsverity for the file. We drop any existing verity items
- * and set the in progress bit.
+ * Begin enabling fsverity for the file. We drop any existing verity items, add
+ * an orphan and set the in progress bit.
  *
  * Returns 0 on success, negative error code on failure.
  */
 static int btrfs_begin_enable_verity(struct file *filp)
 {
 	struct btrfs_inode *inode = BTRFS_I(file_inode(filp));
+	struct btrfs_root *root = inode->root;
+	struct btrfs_trans_handle *trans;
 	int ret;
 
 	ASSERT(inode_is_locked(file_inode(filp)));
@@ -524,11 +583,25 @@ static int btrfs_begin_enable_verity(struct file *filp)
 	if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags))
 		return -EBUSY;
 
+	/*
+	 * This should almost never do anything, but theoretically, it's
+	 * possible that we failed to enable verity on a file, then were
+	 * interrupted or failed while rolling back, failed to cleanup the
+	 * orphan, and finally attempt to enable verity again.
+	 */
 	ret = btrfs_drop_verity_items(inode);
 	if (ret)
 		return ret;
 
-	set_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
+	/* 1 for the orphan item */
+	trans = btrfs_start_transaction(root, 1);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	ret = btrfs_orphan_add(trans, inode);
+	if (!ret)
+		set_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
+	btrfs_end_transaction(trans);
 
 	return 0;
 }
-- 
cgit v1.2.3


From ea3dc7d2d1f524eb2a34b6501be38c82be5c7ff1 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Wed, 28 Jul 2021 18:10:39 +0200
Subject: btrfs: print if fsverity support is built in when loading module

As fsverity support depends on a config option, print that at module
load time like we do for similar features.

Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/super.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2bdc544b4c95..d444338db3c6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2566,6 +2566,11 @@ static void __init btrfs_print_mod_info(void)
 			", zoned=yes"
 #else
 			", zoned=no"
+#endif
+#ifdef CONFIG_FS_VERITY
+			", fsverity=yes"
+#else
+			", fsverity=no"
 #endif
 			;
 	pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options);
-- 
cgit v1.2.3


From 0ff40a910f5649dfacc4fb5daa7e73692196342d Mon Sep 17 00:00:00 2001
From: Marcos Paulo de Souza <mpdesouza@suse.com>
Date: Thu, 29 Jul 2021 05:22:16 -0300
Subject: btrfs: introduce btrfs_search_backwards function

It's a common practice to start a search using offset (u64)-1, which is
the u64 maximum value, meaning that we want the search_slot function to
be set in the last item with the same objectid and type.

Once we are in this position, it's a matter to start a search backwards
by calling btrfs_previous_item, which will check if we'll need to go to
a previous leaf and other necessary checks, only to be sure that we are
in last offset of the same object and type.

The new btrfs_search_backwards function does the all these steps when
necessary, and can be used to avoid code duplication.

Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c   | 21 +++++++++++++++++++++
 fs/btrfs/ctree.h   |  4 ++++
 fs/btrfs/ioctl.c   | 30 ++++++++----------------------
 fs/btrfs/super.c   | 26 ++++++--------------------
 fs/btrfs/volumes.c |  7 +------
 5 files changed, 40 insertions(+), 48 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 99b33a5b33c8..84627cbd5b5b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2101,6 +2101,27 @@ again:
 	return 0;
 }
 
+/*
+ * Execute search and call btrfs_previous_item to traverse backwards if the item
+ * was not found.
+ *
+ * Return 0 if found, 1 if not found and < 0 if error.
+ */
+int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
+			   struct btrfs_path *path)
+{
+	int ret;
+
+	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+	if (ret > 0)
+		ret = btrfs_previous_item(root, path, key->objectid, key->type);
+
+	if (ret == 0)
+		btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
+
+	return ret;
+}
+
 /*
  * adjust the pointers going up the tree, starting at level
  * making sure the right key of each node is points to 'key'.
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f17be4b023cb..a898257ad2b5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2908,6 +2908,10 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
 			u64 time_seq);
+
+int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
+			   struct btrfs_path *path);
+
 static inline int btrfs_next_old_item(struct btrfs_root *root,
 				      struct btrfs_path *p, u64 time_seq)
 {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 85c8b5a87a6a..ba1dab6a5012 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2389,23 +2389,16 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
 	key.offset = (u64)-1;
 
 	while (1) {
-		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		ret = btrfs_search_backwards(root, &key, path);
 		if (ret < 0)
 			goto out;
 		else if (ret > 0) {
-			ret = btrfs_previous_item(root, path, dirid,
-						  BTRFS_INODE_REF_KEY);
-			if (ret < 0)
-				goto out;
-			else if (ret > 0) {
-				ret = -ENOENT;
-				goto out;
-			}
+			ret = -ENOENT;
+			goto out;
 		}
 
 		l = path->nodes[0];
 		slot = path->slots[0];
-		btrfs_item_key_to_cpu(l, &key, slot);
 
 		iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
 		len = btrfs_inode_ref_name_len(l, iref);
@@ -2480,23 +2473,16 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
 		key.type = BTRFS_INODE_REF_KEY;
 		key.offset = (u64)-1;
 		while (1) {
-			ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-			if (ret < 0) {
+			ret = btrfs_search_backwards(root, &key, path);
+			if (ret < 0)
+				goto out_put;
+			else if (ret > 0) {
+				ret = -ENOENT;
 				goto out_put;
-			} else if (ret > 0) {
-				ret = btrfs_previous_item(root, path, dirid,
-							  BTRFS_INODE_REF_KEY);
-				if (ret < 0) {
-					goto out_put;
-				} else if (ret > 0) {
-					ret = -ENOENT;
-					goto out_put;
-				}
 			}
 
 			leaf = path->nodes[0];
 			slot = path->slots[0];
-			btrfs_item_key_to_cpu(leaf, &key, slot);
 
 			iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref);
 			len = btrfs_inode_ref_name_len(leaf, iref);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d444338db3c6..409bee3e7587 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1201,21 +1201,14 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
 		key.type = BTRFS_ROOT_BACKREF_KEY;
 		key.offset = (u64)-1;
 
-		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		ret = btrfs_search_backwards(root, &key, path);
 		if (ret < 0) {
 			goto err;
 		} else if (ret > 0) {
-			ret = btrfs_previous_item(root, path, subvol_objectid,
-						  BTRFS_ROOT_BACKREF_KEY);
-			if (ret < 0) {
-				goto err;
-			} else if (ret > 0) {
-				ret = -ENOENT;
-				goto err;
-			}
+			ret = -ENOENT;
+			goto err;
 		}
 
-		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 		subvol_objectid = key.offset;
 
 		root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -1248,21 +1241,14 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
 			key.type = BTRFS_INODE_REF_KEY;
 			key.offset = (u64)-1;
 
-			ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
+			ret = btrfs_search_backwards(fs_root, &key, path);
 			if (ret < 0) {
 				goto err;
 			} else if (ret > 0) {
-				ret = btrfs_previous_item(fs_root, path, dirid,
-							  BTRFS_INODE_REF_KEY);
-				if (ret < 0) {
-					goto err;
-				} else if (ret > 0) {
-					ret = -ENOENT;
-					goto err;
-				}
+				ret = -ENOENT;
+				goto err;
 			}
 
-			btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 			dirid = key.offset;
 
 			inode_ref = btrfs_item_ptr(path->nodes[0],
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 230192d097c4..536e60c6ade3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1586,14 +1586,9 @@ again:
 	key.offset = search_start;
 	key.type = BTRFS_DEV_EXTENT_KEY;
 
-	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	ret = btrfs_search_backwards(root, &key, path);
 	if (ret < 0)
 		goto out;
-	if (ret > 0) {
-		ret = btrfs_previous_item(root, path, key.objectid, key.type);
-		if (ret < 0)
-			goto out;
-	}
 
 	while (1) {
 		l = path->nodes[0];
-- 
cgit v1.2.3


From 98caf9531e1de8104b45fabbab4b6c2f290068fa Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Tue, 27 Jul 2021 16:17:26 -0500
Subject: btrfs: allocate file_ra_state on stack in readahead_cache

Instead of allocating file_ra_state using kmalloc, allocate on stack.
sizeof(struct readahead) = 32 bytes.

Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/free-space-cache.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 2131ae5b9ed7..8eeb65278ac0 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -344,19 +344,13 @@ fail:
 
 static void readahead_cache(struct inode *inode)
 {
-	struct file_ra_state *ra;
+	struct file_ra_state ra;
 	unsigned long last_index;
 
-	ra = kzalloc(sizeof(*ra), GFP_NOFS);
-	if (!ra)
-		return;
-
-	file_ra_state_init(ra, inode->i_mapping);
+	file_ra_state_init(&ra, inode->i_mapping);
 	last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
 
-	page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
-
-	kfree(ra);
+	page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index);
 }
 
 static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
-- 
cgit v1.2.3


From 0afb603afc3e3dd15c99dd34d5e18b46f9f5c0e4 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Tue, 27 Jul 2021 16:17:29 -0500
Subject: btrfs: allocate btrfs_ioctl_quota_rescan_args on stack

Instead of using kmalloc() to allocate btrfs_ioctl_quota_rescan_args,
allocate btrfs_ioctl_quota_rescan_args on stack, the size is reasonably
small and ioctls are called in process context.

sizeof(btrfs_ioctl_quota_rescan_args) = 64

Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ba1dab6a5012..3d0ae797cd8d 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4403,25 +4403,20 @@ drop_write:
 static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
 						void __user *arg)
 {
-	struct btrfs_ioctl_quota_rescan_args *qsa;
+	struct btrfs_ioctl_quota_rescan_args qsa = {0};
 	int ret = 0;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	qsa = kzalloc(sizeof(*qsa), GFP_KERNEL);
-	if (!qsa)
-		return -ENOMEM;
-
 	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
-		qsa->flags = 1;
-		qsa->progress = fs_info->qgroup_rescan_progress.objectid;
+		qsa.flags = 1;
+		qsa.progress = fs_info->qgroup_rescan_progress.objectid;
 	}
 
-	if (copy_to_user(arg, qsa, sizeof(*qsa)))
+	if (copy_to_user(arg, &qsa, sizeof(qsa)))
 		ret = -EFAULT;
 
-	kfree(qsa);
 	return ret;
 }
 
-- 
cgit v1.2.3


From c853a5783ebe123847886d432354931874367292 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Tue, 27 Jul 2021 16:17:30 -0500
Subject: btrfs: allocate btrfs_ioctl_defrag_range_args on stack

Instead of using kmalloc() to allocate btrfs_ioctl_defrag_range_args,
allocate btrfs_ioctl_defrag_range_args on stack, the size is reasonably
small and ioctls are called in process context.

sizeof(btrfs_ioctl_defrag_range_args) = 48

Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c | 23 +++++++----------------
 1 file changed, 7 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3d0ae797cd8d..d09eaa83b5d2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3096,7 +3096,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 {
 	struct inode *inode = file_inode(file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_ioctl_defrag_range_args *range;
+	struct btrfs_ioctl_defrag_range_args range = {0};
 	int ret;
 
 	ret = mnt_want_write_file(file);
@@ -3134,33 +3134,24 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 			goto out;
 		}
 
-		range = kzalloc(sizeof(*range), GFP_KERNEL);
-		if (!range) {
-			ret = -ENOMEM;
-			goto out;
-		}
-
 		if (argp) {
-			if (copy_from_user(range, argp,
-					   sizeof(*range))) {
+			if (copy_from_user(&range, argp, sizeof(range))) {
 				ret = -EFAULT;
-				kfree(range);
 				goto out;
 			}
 			/* compression requires us to start the IO */
-			if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
-				range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
-				range->extent_thresh = (u32)-1;
+			if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
+				range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
+				range.extent_thresh = (u32)-1;
 			}
 		} else {
 			/* the rest are all set to zero by kzalloc */
-			range->len = (u64)-1;
+			range.len = (u64)-1;
 		}
 		ret = btrfs_defrag_file(file_inode(file), file,
-					range, BTRFS_OLDEST_GENERATION, 0);
+					&range, BTRFS_OLDEST_GENERATION, 0);
 		if (ret > 0)
 			ret = 0;
-		kfree(range);
 		break;
 	default:
 		ret = -EINVAL;
-- 
cgit v1.2.3


From dce2815039061116c41da1db24b9282e6e5e1734 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Tue, 27 Jul 2021 16:17:31 -0500
Subject: btrfs: allocate backref_ctx on stack in find_extent_clone

Instead of using kmalloc() to allocate backref_ctx, allocate backref_ctx
on stack. The size is reasonably small.

sizeof(backref_ctx) = 48

Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/send.c | 29 +++++++++++------------------
 1 file changed, 11 insertions(+), 18 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 75cff564dedf..72f9b865e847 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1307,7 +1307,7 @@ static int find_extent_clone(struct send_ctx *sctx,
 	u64 flags = 0;
 	struct btrfs_file_extent_item *fi;
 	struct extent_buffer *eb = path->nodes[0];
-	struct backref_ctx *backref_ctx = NULL;
+	struct backref_ctx backref_ctx = {0};
 	struct clone_root *cur_clone_root;
 	struct btrfs_key found_key;
 	struct btrfs_path *tmp_path;
@@ -1322,12 +1322,6 @@ static int find_extent_clone(struct send_ctx *sctx,
 	/* We only use this path under the commit sem */
 	tmp_path->need_commit_sem = 0;
 
-	backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_KERNEL);
-	if (!backref_ctx) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
 	if (data_offset >= ino_size) {
 		/*
 		 * There may be extents that lie behind the file's size.
@@ -1392,12 +1386,12 @@ static int find_extent_clone(struct send_ctx *sctx,
 		cur_clone_root->found_refs = 0;
 	}
 
-	backref_ctx->sctx = sctx;
-	backref_ctx->found = 0;
-	backref_ctx->cur_objectid = ino;
-	backref_ctx->cur_offset = data_offset;
-	backref_ctx->found_itself = 0;
-	backref_ctx->extent_len = num_bytes;
+	backref_ctx.sctx = sctx;
+	backref_ctx.found = 0;
+	backref_ctx.cur_objectid = ino;
+	backref_ctx.cur_offset = data_offset;
+	backref_ctx.found_itself = 0;
+	backref_ctx.extent_len = num_bytes;
 
 	/*
 	 * The last extent of a file may be too large due to page alignment.
@@ -1405,7 +1399,7 @@ static int find_extent_clone(struct send_ctx *sctx,
 	 * __iterate_backrefs work.
 	 */
 	if (data_offset + num_bytes >= ino_size)
-		backref_ctx->extent_len = ino_size - data_offset;
+		backref_ctx.extent_len = ino_size - data_offset;
 
 	/*
 	 * Now collect all backrefs.
@@ -1416,12 +1410,12 @@ static int find_extent_clone(struct send_ctx *sctx,
 		extent_item_pos = 0;
 	ret = iterate_extent_inodes(fs_info, found_key.objectid,
 				    extent_item_pos, 1, __iterate_backrefs,
-				    backref_ctx, false);
+				    &backref_ctx, false);
 
 	if (ret < 0)
 		goto out;
 
-	if (!backref_ctx->found_itself) {
+	if (!backref_ctx.found_itself) {
 		/* found a bug in backref code? */
 		ret = -EIO;
 		btrfs_err(fs_info,
@@ -1434,7 +1428,7 @@ static int find_extent_clone(struct send_ctx *sctx,
 		    "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu",
 		    data_offset, ino, num_bytes, logical);
 
-	if (!backref_ctx->found)
+	if (!backref_ctx.found)
 		btrfs_debug(fs_info, "no clones found");
 
 	cur_clone_root = NULL;
@@ -1458,7 +1452,6 @@ static int find_extent_clone(struct send_ctx *sctx,
 
 out:
 	btrfs_free_path(tmp_path);
-	kfree(backref_ctx);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 1c167b87f4f9c89e33dcffb92a45e30f937f04d6 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 29 Jul 2021 15:28:34 +0100
Subject: btrfs: remove unnecessary NULL check for the new inode during rename
 exchange

At the very end of btrfs_rename_exchange(), in case an error happened, we
are checking if 'new_inode' is NULL, but that is not needed since during a
rename exchange, unlike regular renames, 'new_inode' can never be NULL,
and if it were, we would have a crashed much earlier when we dereference it
multiple times.

So remove the check because it is not necessary and because it is causing
static checkers to emit a warning. I probably introduced the check by
copy-pasting similar code from btrfs_rename(), where 'new_inode' can be
NULL, in commit 86e8aa0e772cab ("Btrfs: unpin logs if rename exchange
operation fails").

Reported-by: kernel test robot <lkp@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bc41d6c8d8d8..d8a1e58b4bc8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9523,8 +9523,7 @@ out_fail:
 		if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
 		    btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
 		    btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
-		    (new_inode &&
-		     btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
+		    btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))
 			btrfs_set_log_full_commit(trans);
 
 		if (root_log_pinned) {
-- 
cgit v1.2.3


From d135a5339611352047462ef5943aee3a1202aa37 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 29 Jul 2021 15:29:01 +0100
Subject: btrfs: remove no longer needed full sync flag check at inode_logged()

Now that we are checking if the inode's logged_trans is 0 to detect the
possibility of the inode having been evicted and reloaded, the test for
the full sync flag (BTRFS_INODE_NEEDS_FULL_SYNC) is no longer needed at
tree-log.c:inode_logged(). Its purpose was to detect the possibility
of a previous eviction as well, since when an inode is loaded the full
sync flag is always set on it (and only cleared after the inode is
logged).

So just remove the check and update the comment. The check for the inode's
logged_trans being 0 was added recently by the patch with the subject
"btrfs: eliminate some false positives when checking if inode was logged".

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 567adc3de11a..5debb8c3663c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3435,16 +3435,14 @@ static bool inode_logged(struct btrfs_trans_handle *trans,
 	/*
 	 * The inode's logged_trans is always 0 when we load it (because it is
 	 * not persisted in the inode item or elsewhere). So if it is 0, the
-	 * inode was last modified in the current transaction and has the
-	 * full_sync flag set, then the inode may have been logged before in
-	 * the current transaction, then evicted and loaded again in the current
-	 * transaction - or may have never been logged in the current transaction,
-	 * but since we can not be sure, we have to assume it was, otherwise our
-	 * callers can leave an inconsistent log.
+	 * inode was last modified in the current transaction then the inode may
+	 * have been logged before in the current transaction, then evicted and
+	 * loaded again in the current transaction - or may have never been logged
+	 * in the current transaction, but since we can not be sure, we have to
+	 * assume it was, otherwise our callers can leave an inconsistent log.
 	 */
 	if (inode->logged_trans == 0 &&
 	    inode->last_trans == trans->transid &&
-	    test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
 	    !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
 		return true;
 
-- 
cgit v1.2.3


From 1f295373022e84683bc5768caca46bdba3a376c1 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 29 Jul 2021 15:30:21 +0100
Subject: btrfs: update comment at log_conflicting_inodes()

A comment at log_conflicting_inodes() mentions that we check the inode's
logged_trans field instead of using btrfs_inode_in_log() because the field
last_log_commit is not updated when we log that an inode exists and the
inode has the full sync flag (BTRFS_INODE_NEEDS_FULL_SYNC) set. The part
about the full sync flag is not true anymore since commit 9acc8103ab594f
("btrfs: fix unpersisted i_size on fsync after expanding truncate"), so
update the comment to not mention that part anymore.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 5debb8c3663c..1ce7e4480256 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5092,8 +5092,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
 		/*
 		 * Check the inode's logged_trans only instead of
 		 * btrfs_inode_in_log(). This is because the last_log_commit of
-		 * the inode is not updated when we only log that it exists and
-		 * it has the full sync bit set (see btrfs_log_inode()).
+		 * the inode is not updated when we only log that it exists (see
+		 * btrfs_log_inode()).
 		 */
 		if (BTRFS_I(inode)->logged_trans == trans->transid) {
 			spin_unlock(&BTRFS_I(inode)->lock);
-- 
cgit v1.2.3


From 77233c2d2ec95030afcaf9fd90e4bdd6125e5c15 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Date: Mon, 9 Aug 2021 20:41:17 +0900
Subject: btrfs: zoned: allow disabling of zone auto reclaim

Automatically reclaiming dirty zones might not always be desired for all
workloads, especially as there are currently still some rough edges with
the relocation code on zoned filesystems.

Allow disabling zone auto reclaim on a per filesystem basis by writing 0
as the threshold value.

Reviewed-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/free-space-cache.c | 7 ++++---
 fs/btrfs/sysfs.c            | 7 ++++---
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 8eeb65278ac0..e91440bd0794 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2538,6 +2538,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	u64 offset = bytenr - block_group->start;
 	u64 to_free, to_unusable;
+	const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold);
 
 	spin_lock(&ctl->tree_lock);
 	if (!used)
@@ -2567,9 +2568,9 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
 	/* All the region is now unusable. Mark it as unused and reclaim */
 	if (block_group->zone_unusable == block_group->length) {
 		btrfs_mark_bg_unused(block_group);
-	} else if (block_group->zone_unusable >=
-		   div_factor_fine(block_group->length,
-				   fs_info->bg_reclaim_threshold)) {
+	} else if (bg_reclaim_threshold &&
+		   block_group->zone_unusable >=
+		   div_factor_fine(block_group->length, bg_reclaim_threshold)) {
 		btrfs_mark_bg_to_reclaim(block_group);
 	}
 
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index bfe5e27617b0..c1261309a817 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -984,7 +984,8 @@ static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj,
 	struct btrfs_fs_info *fs_info = to_fs_info(kobj);
 	ssize_t ret;
 
-	ret = scnprintf(buf, PAGE_SIZE, "%d\n", fs_info->bg_reclaim_threshold);
+	ret = scnprintf(buf, PAGE_SIZE, "%d\n",
+			READ_ONCE(fs_info->bg_reclaim_threshold));
 
 	return ret;
 }
@@ -1001,10 +1002,10 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
 	if (ret)
 		return ret;
 
-	if (thresh <= 50 || thresh > 100)
+	if (thresh != 0 && (thresh <= 50 || thresh > 100))
 		return -EINVAL;
 
-	fs_info->bg_reclaim_threshold = thresh;
+	WRITE_ONCE(fs_info->bg_reclaim_threshold, thresh);
 
 	return len;
 }
-- 
cgit v1.2.3


From ba86dd9fe60e5853fbff96f2658212908b83f271 Mon Sep 17 00:00:00 2001
From: Naohiro Aota <naohiro.aota@wdc.com>
Date: Mon, 9 Aug 2021 13:32:30 +0900
Subject: btrfs: zoned: suppress reclaim error message on EAGAIN

btrfs_relocate_chunk() can fail with -EAGAIN when e.g. send operations are
running. The message can fail btrfs/187 and it's unnecessary because we
anyway add it back to the reclaim list.

btrfs_reclaim_bgs_work()
`-> btrfs_relocate_chunk()
    `-> btrfs_relocate_block_group()
        `-> reloc_chunk_start()
            `-> if (fs_info->send_in_progress)
                `-> return -EAGAIN

CC: stable@vger.kernel.org # 5.13+
Fixes: 18bb8bbf13c1 ("btrfs: zoned: automatically reclaim zones")
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/block-group.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index d5421ee0d366..a3b830b8410a 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1561,7 +1561,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
 				div64_u64(zone_unusable * 100, bg->length));
 		trace_btrfs_reclaim_block_group(bg);
 		ret = btrfs_relocate_chunk(fs_info, bg->start);
-		if (ret)
+		if (ret && ret != -EAGAIN)
 			btrfs_err(fs_info, "error relocating chunk %llu",
 				  bg->start);
 
-- 
cgit v1.2.3


From 0ae79c6fe70d5c5c645733b7ed39d5e6021d8c9a Mon Sep 17 00:00:00 2001
From: Naohiro Aota <naohiro.aota@wdc.com>
Date: Mon, 9 Aug 2021 13:13:44 +0900
Subject: btrfs: zoned: fix block group alloc_offset calculation

alloc_offset is offset from the start of a block group and @offset is
actually an address in logical space. Thus, we need to consider
block_group->start when calculating them.

Fixes: 011b41bffa3d ("btrfs: zoned: advance allocation pointer after tree log node")
CC: stable@vger.kernel.org # 5.12+
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/free-space-cache.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index e91440bd0794..da0eee7c9e5f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2647,8 +2647,11 @@ int btrfs_remove_free_space(struct btrfs_block_group *block_group,
 		 * btrfs_pin_extent_for_log_replay() when replaying the log.
 		 * Advance the pointer not to overwrite the tree-log nodes.
 		 */
-		if (block_group->alloc_offset < offset + bytes)
-			block_group->alloc_offset = offset + bytes;
+		if (block_group->start + block_group->alloc_offset <
+		    offset + bytes) {
+			block_group->alloc_offset =
+				offset + bytes - block_group->start;
+		}
 		return 0;
 	}
 
-- 
cgit v1.2.3


From 63fb5879db7ca94fefac12cf7a5a051cee889c12 Mon Sep 17 00:00:00 2001
From: Naohiro Aota <naohiro.aota@wdc.com>
Date: Mon, 9 Aug 2021 09:29:18 +0900
Subject: btrfs: zoned: add asserts on splitting extent_map

We call split_zoned_em() on an extent_map on submitting a bio for it. Thus,
we can assume the extent_map is PINNED, not LOGGING, and in the modified
list. Add ASSERT()s to ensure the extent_maps after the split also has the
proper flags set and are in the modified list.

Suggested-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d8a1e58b4bc8..09bd7c11c99b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2303,7 +2303,6 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
 	struct extent_map *split_mid = NULL;
 	struct extent_map *split_post = NULL;
 	int ret = 0;
-	int modified;
 	unsigned long flags;
 
 	/* Sanity check */
@@ -2333,11 +2332,12 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
 	ASSERT(em->len == len);
 	ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
 	ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
+	ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
+	ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
+	ASSERT(!list_empty(&em->list));
 
 	flags = em->flags;
 	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
-	clear_bit(EXTENT_FLAG_LOGGING, &flags);
-	modified = !list_empty(&em->list);
 
 	/* First, replace the em with a new extent_map starting from * em->start */
 	split_pre->start = em->start;
@@ -2351,7 +2351,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
 	split_pre->compress_type = em->compress_type;
 	split_pre->generation = em->generation;
 
-	replace_extent_mapping(em_tree, em, split_pre, modified);
+	replace_extent_mapping(em_tree, em, split_pre, 1);
 
 	/*
 	 * Now we only have an extent_map at:
@@ -2371,7 +2371,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
 		split_mid->flags = flags;
 		split_mid->compress_type = em->compress_type;
 		split_mid->generation = em->generation;
-		add_extent_mapping(em_tree, split_mid, modified);
+		add_extent_mapping(em_tree, split_mid, 1);
 	}
 
 	if (post) {
@@ -2385,7 +2385,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
 		split_post->flags = flags;
 		split_post->compress_type = em->compress_type;
 		split_post->generation = em->generation;
-		add_extent_mapping(em_tree, split_post, modified);
+		add_extent_mapping(em_tree, split_post, 1);
 	}
 
 	/* Once for us */
-- 
cgit v1.2.3


From e4571b8c5e9ffa1e85c0c671995bd4dcc5c75091 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Fri, 6 Aug 2021 18:24:15 +0800
Subject: btrfs: fix NULL pointer dereference when deleting device by invalid
 id

[BUG]
It's easy to trigger NULL pointer dereference, just by removing a
non-existing device id:

 # mkfs.btrfs -f -m single -d single /dev/test/scratch1 \
				     /dev/test/scratch2
 # mount /dev/test/scratch1 /mnt/btrfs
 # btrfs device remove 3 /mnt/btrfs

Then we have the following kernel NULL pointer dereference:

 BUG: kernel NULL pointer dereference, address: 0000000000000000
 #PF: supervisor read access in kernel mode
 #PF: error_code(0x0000) - not-present page
 PGD 0 P4D 0
 Oops: 0000 [#1] PREEMPT SMP NOPTI
 CPU: 9 PID: 649 Comm: btrfs Not tainted 5.14.0-rc3-custom+ #35
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
 RIP: 0010:btrfs_rm_device+0x4de/0x6b0 [btrfs]
  btrfs_ioctl+0x18bb/0x3190 [btrfs]
  ? lock_is_held_type+0xa5/0x120
  ? find_held_lock.constprop.0+0x2b/0x80
  ? do_user_addr_fault+0x201/0x6a0
  ? lock_release+0xd2/0x2d0
  ? __x64_sys_ioctl+0x83/0xb0
  __x64_sys_ioctl+0x83/0xb0
  do_syscall_64+0x3b/0x90
  entry_SYSCALL_64_after_hwframe+0x44/0xae

[CAUSE]
Commit a27a94c2b0c7 ("btrfs: Make btrfs_find_device_by_devspec return
btrfs_device directly") moves the "missing" device path check into
btrfs_rm_device().

But btrfs_rm_device() itself can have case where it only receives
@devid, with NULL as @device_path.

In that case, calling strcmp() on NULL will trigger the NULL pointer
dereference.

Before that commit, we handle the "missing" case inside
btrfs_find_device_by_devspec(), which will not check @device_path at all
if @devid is provided, thus no way to trigger the bug.

[FIX]
Before calling strcmp(), also make sure @device_path is not NULL.

Fixes: a27a94c2b0c7 ("btrfs: Make btrfs_find_device_by_devspec return btrfs_device directly")
CC: stable@vger.kernel.org # 5.4+
Reported-by: butt3rflyh4ck <butterflyhuangxx@gmail.com>
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 536e60c6ade3..7fec0c68b744 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2074,7 +2074,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 
 	if (IS_ERR(device)) {
 		if (PTR_ERR(device) == -ENOENT &&
-		    strcmp(device_path, "missing") == 0)
+		    device_path && strcmp(device_path, "missing") == 0)
 			ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
 		else
 			ret = PTR_ERR(device);
-- 
cgit v1.2.3


From e7849e33cf5d785568b181e3c15236e32c7dfdb2 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Tue, 10 Aug 2021 21:55:59 +0800
Subject: btrfs: sysfs: document structures and their associated files

Sysfs file has grown big. It takes some time to locate the correct
struct attribute to add new files. Create a table and map the struct
attribute to its sysfs path.

Also, fix the comment about the debug sysfs path.  And add the comments
to the attributes instead of attribute group, where sysfs file names are
defined.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/sysfs.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 75 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index c1261309a817..25a6f587852b 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -22,6 +22,26 @@
 #include "block-group.h"
 #include "qgroup.h"
 
+/*
+ * Structure name                       Path
+ * --------------------------------------------------------------------------
+ * btrfs_supported_static_feature_attrs /sys/fs/btrfs/features
+ * btrfs_supported_feature_attrs	/sys/fs/btrfs/features and
+ *					/sys/fs/btrfs/<uuid>/features
+ * btrfs_attrs				/sys/fs/btrfs/<uuid>
+ * devid_attrs				/sys/fs/btrfs/<uuid>/devinfo/<devid>
+ * allocation_attrs			/sys/fs/btrfs/<uuid>/allocation
+ * qgroup_attrs				/sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid>
+ * space_info_attrs			/sys/fs/btrfs/<uuid>/allocation/<bg-type>
+ * raid_attrs				/sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile>
+ *
+ * When built with BTRFS_CONFIG_DEBUG:
+ *
+ * btrfs_debug_feature_attrs		/sys/fs/btrfs/debug
+ * btrfs_debug_mount_attrs		/sys/fs/btrfs/<uuid>/debug
+ * discard_debug_attrs			/sys/fs/btrfs/<uuid>/debug/discard
+ */
+
 struct btrfs_feature_attr {
 	struct kobj_attribute kobj_attr;
 	enum btrfs_feature_set feature_set;
@@ -271,6 +291,13 @@ BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
 BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY);
 #endif
 
+/*
+ * Features which depend on feature bits and may differ between each fs.
+ *
+ * /sys/fs/btrfs/features      - all available features implemeted by this version
+ * /sys/fs/btrfs/UUID/features - features of the fs which are enabled or
+ *                               can be changed on a mounted filesystem.
+ */
 static struct attribute *btrfs_supported_feature_attrs[] = {
 	BTRFS_FEAT_ATTR_PTR(mixed_backref),
 	BTRFS_FEAT_ATTR_PTR(default_subvol),
@@ -294,13 +321,6 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
 	NULL
 };
 
-/*
- * Features which depend on feature bits and may differ between each fs.
- *
- * /sys/fs/btrfs/features lists all available features of this kernel while
- * /sys/fs/btrfs/UUID/features shows features of the fs which are enabled or
- * can be changed online.
- */
 static const struct attribute_group btrfs_feature_attr_group = {
 	.name = "features",
 	.is_visible = btrfs_feature_visible,
@@ -384,6 +404,12 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
 BTRFS_ATTR(static_feature, supported_sectorsizes,
 	   supported_sectorsizes_show);
 
+/*
+ * Features which only depend on kernel version.
+ *
+ * These are listed in /sys/fs/btrfs/features along with
+ * btrfs_supported_feature_attrs.
+ */
 static struct attribute *btrfs_supported_static_feature_attrs[] = {
 	BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
 	BTRFS_ATTR_PTR(static_feature, supported_checksums),
@@ -393,12 +419,6 @@ static struct attribute *btrfs_supported_static_feature_attrs[] = {
 	NULL
 };
 
-/*
- * Features which only depend on kernel version.
- *
- * These are listed in /sys/fs/btrfs/features along with
- * btrfs_feature_attr_group
- */
 static const struct attribute_group btrfs_static_feature_attr_group = {
 	.name = "features",
 	.attrs = btrfs_supported_static_feature_attrs,
@@ -557,6 +577,11 @@ static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj,
 BTRFS_ATTR_RW(discard, max_discard_size, btrfs_discard_max_discard_size_show,
 	      btrfs_discard_max_discard_size_store);
 
+/*
+ * Per-filesystem debugging of discard (when mounted with discard=async).
+ *
+ * Path: /sys/fs/btrfs/<uuid>/debug/discard/
+ */
 static const struct attribute *discard_debug_attrs[] = {
 	BTRFS_ATTR_PTR(discard, discardable_bytes),
 	BTRFS_ATTR_PTR(discard, discardable_extents),
@@ -570,15 +595,19 @@ static const struct attribute *discard_debug_attrs[] = {
 };
 
 /*
- * Runtime debugging exported via sysfs
+ * Per-filesystem runtime debugging exported via sysfs.
  *
- * /sys/fs/btrfs/debug - applies to module or all filesystems
- * /sys/fs/btrfs/UUID  - applies only to the given filesystem
+ * Path: /sys/fs/btrfs/UUID/debug/
  */
 static const struct attribute *btrfs_debug_mount_attrs[] = {
 	NULL,
 };
 
+/*
+ * Runtime debugging exported via sysfs, applies to all mounted filesystems.
+ *
+ * Path: /sys/fs/btrfs/debug
+ */
 static struct attribute *btrfs_debug_feature_attrs[] = {
 	NULL
 };
@@ -647,6 +676,11 @@ static ssize_t raid_bytes_show(struct kobject *kobj,
 	return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
 }
 
+/*
+ * Allocation information about block group profiles.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile>/
+ */
 static struct attribute *raid_attrs[] = {
 	BTRFS_ATTR_PTR(raid, total_bytes),
 	BTRFS_ATTR_PTR(raid, used_bytes),
@@ -686,6 +720,11 @@ SPACE_INFO_ATTR(bytes_zone_unusable);
 SPACE_INFO_ATTR(disk_used);
 SPACE_INFO_ATTR(disk_total);
 
+/*
+ * Allocation information about block group types.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/allocation/<bg-type>/
+ */
 static struct attribute *space_info_attrs[] = {
 	BTRFS_ATTR_PTR(space_info, flags),
 	BTRFS_ATTR_PTR(space_info, total_bytes),
@@ -713,6 +752,11 @@ static struct kobj_type space_info_ktype = {
 	.default_groups = space_info_groups,
 };
 
+/*
+ * Allocation information about block groups.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/allocation/
+ */
 static const struct attribute *allocation_attrs[] = {
 	BTRFS_ATTR_PTR(allocation, global_rsv_reserved),
 	BTRFS_ATTR_PTR(allocation, global_rsv_size),
@@ -1012,6 +1056,11 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
 BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show,
 	      btrfs_bg_reclaim_threshold_store);
 
+/*
+ * Per-filesystem information and stats.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/
+ */
 static const struct attribute *btrfs_attrs[] = {
 	BTRFS_ATTR_PTR(, label),
 	BTRFS_ATTR_PTR(, nodesize),
@@ -1521,6 +1570,11 @@ static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj,
 }
 BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show);
 
+/*
+ * Information about one device.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/devinfo/<devid>/
+ */
 static struct attribute *devid_attrs[] = {
 	BTRFS_ATTR_PTR(devid, error_stats),
 	BTRFS_ATTR_PTR(devid, in_fs_metadata),
@@ -1810,6 +1864,11 @@ QGROUP_RSV_ATTR(data, BTRFS_QGROUP_RSV_DATA);
 QGROUP_RSV_ATTR(meta_pertrans, BTRFS_QGROUP_RSV_META_PERTRANS);
 QGROUP_RSV_ATTR(meta_prealloc, BTRFS_QGROUP_RSV_META_PREALLOC);
 
+/*
+ * Qgroup information.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid>/
+ */
 static struct attribute *qgroup_attrs[] = {
 	BTRFS_ATTR_PTR(qgroup, referenced),
 	BTRFS_ATTR_PTR(qgroup, exclusive),
-- 
cgit v1.2.3


From c2fd68b6b2b00f0a6280b5971028c10c8f0ba70f Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Tue, 27 Jul 2021 12:48:40 +0200
Subject: namei: add mapping aware lookup helper

Various filesystems rely on the lookup_one_len() helper to lookup a
single path component relative to a well-known starting point. Allow
such filesystems to support idmapped mounts by adding a version of this
helper to take the idmap into account when calling inode_permission().
This change is a required to let btrfs (and other filesystems) support
idmapped mounts.

Cc: Christoph Hellwig <hch@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/namei.c            | 43 +++++++++++++++++++++++++++++++++++++------
 include/linux/namei.h |  1 +
 2 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index bf6d8a738c59..902df46e7dd3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2575,8 +2575,9 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 }
 EXPORT_SYMBOL(vfs_path_lookup);
 
-static int lookup_one_len_common(const char *name, struct dentry *base,
-				 int len, struct qstr *this)
+static int lookup_one_common(struct user_namespace *mnt_userns,
+			     const char *name, struct dentry *base, int len,
+			     struct qstr *this)
 {
 	this->name = name;
 	this->len = len;
@@ -2604,7 +2605,7 @@ static int lookup_one_len_common(const char *name, struct dentry *base,
 			return err;
 	}
 
-	return inode_permission(&init_user_ns, base->d_inode, MAY_EXEC);
+	return inode_permission(mnt_userns, base->d_inode, MAY_EXEC);
 }
 
 /**
@@ -2628,7 +2629,7 @@ struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len
 
 	WARN_ON_ONCE(!inode_is_locked(base->d_inode));
 
-	err = lookup_one_len_common(name, base, len, &this);
+	err = lookup_one_common(&init_user_ns, name, base, len, &this);
 	if (err)
 		return ERR_PTR(err);
 
@@ -2655,7 +2656,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 
 	WARN_ON_ONCE(!inode_is_locked(base->d_inode));
 
-	err = lookup_one_len_common(name, base, len, &this);
+	err = lookup_one_common(&init_user_ns, name, base, len, &this);
 	if (err)
 		return ERR_PTR(err);
 
@@ -2664,6 +2665,36 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 }
 EXPORT_SYMBOL(lookup_one_len);
 
+/**
+ * lookup_one - filesystem helper to lookup single pathname component
+ * @mnt_userns:	user namespace of the mount the lookup is performed from
+ * @name:	pathname component to lookup
+ * @base:	base directory to lookup from
+ * @len:	maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * The caller must hold base->i_mutex.
+ */
+struct dentry *lookup_one(struct user_namespace *mnt_userns, const char *name,
+			  struct dentry *base, int len)
+{
+	struct dentry *dentry;
+	struct qstr this;
+	int err;
+
+	WARN_ON_ONCE(!inode_is_locked(base->d_inode));
+
+	err = lookup_one_common(mnt_userns, name, base, len, &this);
+	if (err)
+		return ERR_PTR(err);
+
+	dentry = lookup_dcache(&this, base, 0);
+	return dentry ? dentry : __lookup_slow(&this, base, 0);
+}
+EXPORT_SYMBOL(lookup_one);
+
 /**
  * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
  * @name:	pathname component to lookup
@@ -2683,7 +2714,7 @@ struct dentry *lookup_one_len_unlocked(const char *name,
 	int err;
 	struct dentry *ret;
 
-	err = lookup_one_len_common(name, base, len, &this);
+	err = lookup_one_common(&init_user_ns, name, base, len, &this);
 	if (err)
 		return ERR_PTR(err);
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index be9a2b349ca7..e89329bb3134 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -68,6 +68,7 @@ extern struct dentry *try_lookup_one_len(const char *, struct dentry *, int);
 extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
 extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
 extern struct dentry *lookup_positive_unlocked(const char *, struct dentry *, int);
+struct dentry *lookup_one(struct user_namespace *, const char *, struct dentry *, int);
 
 extern int follow_down_one(struct path *);
 extern int follow_down(struct path *);
-- 
cgit v1.2.3


From b3b6f5b9225506abc2e8d1f393761a6e509b791f Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Tue, 27 Jul 2021 12:48:41 +0200
Subject: btrfs: handle idmaps in btrfs_new_inode()

Extend btrfs_new_inode() to take the idmapped mount into account when
initializing a new inode. This is just a matter of passing down the
mount's userns. The rest is taken care of in inode_init_owner(). This is
a preliminary patch to make the individual btrfs inode operations
idmapped mount aware.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 09bd7c11c99b..0f9017a207ab 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6422,6 +6422,7 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
 
 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root,
+				     struct user_namespace *mnt_userns,
 				     struct inode *dir,
 				     const char *name, int name_len,
 				     u64 ref_objectid, u64 objectid,
@@ -6531,7 +6532,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	if (ret != 0)
 		goto fail_unlock;
 
-	inode_init_owner(&init_user_ns, inode, dir, mode);
+	inode_init_owner(mnt_userns, inode, dir, mode);
 	inode_set_bytes(inode, 0);
 
 	inode->i_mtime = current_time(inode);
@@ -6716,9 +6717,9 @@ static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
 	if (err)
 		goto out_unlock;
 
-	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
-			dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
-			mode, &index);
+	inode = btrfs_new_inode(trans, root, &init_user_ns, dir,
+			dentry->d_name.name, dentry->d_name.len,
+			btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		inode = NULL;
@@ -6780,9 +6781,9 @@ static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
 	if (err)
 		goto out_unlock;
 
-	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
-			dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
-			mode, &index);
+	inode = btrfs_new_inode(trans, root, &init_user_ns, dir,
+			dentry->d_name.name, dentry->d_name.len,
+			btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		inode = NULL;
@@ -6925,8 +6926,9 @@ static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
 	if (err)
 		goto out_fail;
 
-	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
-			dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
+	inode = btrfs_new_inode(trans, root, &init_user_ns, dir,
+			dentry->d_name.name, dentry->d_name.len,
+			btrfs_ino(BTRFS_I(dir)), objectid,
 			S_IFDIR | mode, &index);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
@@ -9038,7 +9040,8 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 	if (err < 0)
 		return err;
 
-	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, ino, ino,
+	inode = btrfs_new_inode(trans, new_root, &init_user_ns, NULL, "..", 2,
+				ino, ino,
 				S_IFDIR | (~current_umask() & S_IRWXUGO),
 				&index);
 	if (IS_ERR(inode))
@@ -9559,7 +9562,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
 	if (ret)
 		return ret;
 
-	inode = btrfs_new_inode(trans, root, dir,
+	inode = btrfs_new_inode(trans, root, &init_user_ns, dir,
 				dentry->d_name.name,
 				dentry->d_name.len,
 				btrfs_ino(BTRFS_I(dir)),
@@ -10071,9 +10074,10 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
 	if (err)
 		goto out_unlock;
 
-	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
-				dentry->d_name.len, btrfs_ino(BTRFS_I(dir)),
-				objectid, S_IFLNK|S_IRWXUGO, &index);
+	inode = btrfs_new_inode(trans, root, &init_user_ns, dir,
+				dentry->d_name.name, dentry->d_name.len,
+				btrfs_ino(BTRFS_I(dir)), objectid,
+				S_IFLNK | S_IRWXUGO, &index);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		inode = NULL;
@@ -10422,7 +10426,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
 	if (ret)
 		goto out;
 
-	inode = btrfs_new_inode(trans, root, dir, NULL, 0,
+	inode = btrfs_new_inode(trans, root, &init_user_ns, dir, NULL, 0,
 			btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
 	if (IS_ERR(inode)) {
 		ret = PTR_ERR(inode);
-- 
cgit v1.2.3


From ca07274c3da901c7daa2c9280679c282b7c03bef Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Tue, 27 Jul 2021 12:48:42 +0200
Subject: btrfs: allow idmapped rename inode op

Enable btrfs_rename() to handle idmapped mounts. This is just a matter
of passing down the mount's userns.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0f9017a207ab..04a37b750a17 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9550,6 +9550,7 @@ out_notrans:
 
 static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root,
+				     struct user_namespace *mnt_userns,
 				     struct inode *dir,
 				     struct dentry *dentry)
 {
@@ -9562,7 +9563,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
 	if (ret)
 		return ret;
 
-	inode = btrfs_new_inode(trans, root, &init_user_ns, dir,
+	inode = btrfs_new_inode(trans, root, mnt_userns, dir,
 				dentry->d_name.name,
 				dentry->d_name.len,
 				btrfs_ino(BTRFS_I(dir)),
@@ -9599,9 +9600,10 @@ out:
 	return ret;
 }
 
-static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-			   struct inode *new_dir, struct dentry *new_dentry,
-			   unsigned int flags)
+static int btrfs_rename(struct user_namespace *mnt_userns,
+			struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry,
+			unsigned int flags)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
 	struct btrfs_trans_handle *trans;
@@ -9791,8 +9793,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	}
 
 	if (flags & RENAME_WHITEOUT) {
-		ret = btrfs_whiteout_for_rename(trans, root, old_dir,
-						old_dentry);
+		ret = btrfs_whiteout_for_rename(trans, root, mnt_userns,
+						old_dir, old_dentry);
 
 		if (ret) {
 			btrfs_abort_transaction(trans, ret);
@@ -9842,7 +9844,8 @@ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_di
 		return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
 					  new_dentry);
 
-	return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
+	return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
+			    new_dentry, flags);
 }
 
 struct btrfs_delalloc_work {
-- 
cgit v1.2.3


From c020d2eaf1a84ba8611fe2a232f4951faa98e0e0 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Tue, 27 Jul 2021 12:48:43 +0200
Subject: btrfs: allow idmapped getattr inode op

Enable btrfs_getattr() to handle idmapped mounts. This is just a matter
of passing down the mount's userns.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 04a37b750a17..04a30cab799d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9288,7 +9288,7 @@ static int btrfs_getattr(struct user_namespace *mnt_userns,
 				  STATX_ATTR_IMMUTABLE |
 				  STATX_ATTR_NODUMP);
 
-	generic_fillattr(&init_user_ns, inode, stat);
+	generic_fillattr(mnt_userns, inode, stat);
 	stat->dev = BTRFS_I(inode)->root->anon_dev;
 
 	spin_lock(&BTRFS_I(inode)->lock);
-- 
cgit v1.2.3


From 72105277dcfca69175cea713f5edda4132839e14 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Tue, 27 Jul 2021 12:48:44 +0200
Subject: btrfs: allow idmapped mknod inode op

Enable btrfs_mknod() to handle idmapped mounts. This is just a matter of
passing down the mount's userns.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 04a30cab799d..4a1cc106ab1f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6717,7 +6717,7 @@ static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
 	if (err)
 		goto out_unlock;
 
-	inode = btrfs_new_inode(trans, root, &init_user_ns, dir,
+	inode = btrfs_new_inode(trans, root, mnt_userns, dir,
 			dentry->d_name.name, dentry->d_name.len,
 			btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
 	if (IS_ERR(inode)) {
-- 
cgit v1.2.3


From e93ca491d03fda28db54a3d6ddc15f03a61364d7 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Tue, 27 Jul 2021 12:48:45 +0200
Subject: btrfs: allow idmapped create inode op

Enable btrfs_create() to handle idmapped mounts. This is just a matter
of passing down the mount's userns.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4a1cc106ab1f..a75661cd68ec 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6781,7 +6781,7 @@ static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
 	if (err)
 		goto out_unlock;
 
-	inode = btrfs_new_inode(trans, root, &init_user_ns, dir,
+	inode = btrfs_new_inode(trans, root, mnt_userns, dir,
 			dentry->d_name.name, dentry->d_name.len,
 			btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
 	if (IS_ERR(inode)) {
-- 
cgit v1.2.3


From b0b3e44d346c91dde3899d37eddf867b9b36ffdc Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Tue, 27 Jul 2021 12:48:46 +0200
Subject: btrfs: allow idmapped mkdir inode op

Enable btrfs_mkdir() to handle idmapped mounts. This is just a matter of
passing down the mount's userns.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a75661cd68ec..16bd56edb8e1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6926,7 +6926,7 @@ static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
 	if (err)
 		goto out_fail;
 
-	inode = btrfs_new_inode(trans, root, &init_user_ns, dir,
+	inode = btrfs_new_inode(trans, root, mnt_userns, dir,
 			dentry->d_name.name, dentry->d_name.len,
 			btrfs_ino(BTRFS_I(dir)), objectid,
 			S_IFDIR | mode, &index);
-- 
cgit v1.2.3


From 5a0521086e5fc5eb51690d4fc63fd26fdb5ae881 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Tue, 27 Jul 2021 12:48:47 +0200
Subject: btrfs: allow idmapped symlink inode op

Enable btrfs_symlink() to handle idmapped mounts. This is just a matter
of passing down the mount's userns.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 16bd56edb8e1..0ffd48f9b685 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -10077,7 +10077,7 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
 	if (err)
 		goto out_unlock;
 
-	inode = btrfs_new_inode(trans, root, &init_user_ns, dir,
+	inode = btrfs_new_inode(trans, root, mnt_userns, dir,
 				dentry->d_name.name, dentry->d_name.len,
 				btrfs_ino(BTRFS_I(dir)), objectid,
 				S_IFLNK | S_IRWXUGO, &index);
-- 
cgit v1.2.3


From 98b6ab5fc0988242114a4f0e02ed225685d9cc2b Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Tue, 27 Jul 2021 12:48:48 +0200
Subject: btrfs: allow idmapped tmpfile inode op

Enable btrfs_tmpfile() to handle idmapped mounts. This is just a matter
of passing down the mount's userns.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0ffd48f9b685..c040d9925077 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -10429,7 +10429,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
 	if (ret)
 		goto out;
 
-	inode = btrfs_new_inode(trans, root, &init_user_ns, dir, NULL, 0,
+	inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0,
 			btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
 	if (IS_ERR(inode)) {
 		ret = PTR_ERR(inode);
-- 
cgit v1.2.3


From d4d09464614227a6cc9ae42cd1d761e4320e4ebc Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Tue, 27 Jul 2021 12:48:49 +0200
Subject: btrfs: allow idmapped setattr inode op

Enable btrfs_setattr() to handle idmapped mounts. This is just a matter
of passing down the mount's userns.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c040d9925077..88ac63f3b57b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5392,7 +5392,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
 	if (btrfs_root_readonly(root))
 		return -EROFS;
 
-	err = setattr_prepare(&init_user_ns, dentry, attr);
+	err = setattr_prepare(mnt_userns, dentry, attr);
 	if (err)
 		return err;
 
@@ -5403,13 +5403,12 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
 	}
 
 	if (attr->ia_valid) {
-		setattr_copy(&init_user_ns, inode, attr);
+		setattr_copy(mnt_userns, inode, attr);
 		inode_inc_iversion(inode);
 		err = btrfs_dirty_inode(inode);
 
 		if (!err && attr->ia_valid & ATTR_MODE)
-			err = posix_acl_chmod(&init_user_ns, inode,
-					      inode->i_mode);
+			err = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
 	}
 
 	return err;
-- 
cgit v1.2.3


From 3bc71ba02cf5376b390289bef8c9f5d6049f1866 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Tue, 27 Jul 2021 12:48:50 +0200
Subject: btrfs: allow idmapped permission inode op

Enable btrfs_permission() to handle idmapped mounts. This is just a
matter of passing down the mount's userns.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 88ac63f3b57b..6ef2448a9937 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -10403,7 +10403,7 @@ static int btrfs_permission(struct user_namespace *mnt_userns,
 		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
 			return -EACCES;
 	}
-	return generic_permission(&init_user_ns, inode, mask);
+	return generic_permission(mnt_userns, inode, mask);
 }
 
 static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
-- 
cgit v1.2.3


From 5474bf400f16bd1f930627ea65b698bca09dcfc6 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Tue, 27 Jul 2021 12:48:51 +0200
Subject: btrfs: check whether fsgid/fsuid are mapped during subvolume creation

When a new subvolume is created btrfs currently doesn't check whether
the fsgid/fsuid of the caller actually have a mapping in the user
namespace attached to the filesystem. The VFS always checks this to make
sure that the caller's fsgid/fsuid can be represented on-disk. This is
most relevant for filesystems that can be mounted inside user namespaces
but it is in general a good hardening measure to prevent unrepresentable
gid/uid from being written to disk.

Since we want to support idmapped mounts for btrfs ioctls to create
subvolumes in follow-up patches this becomes important since we want to
make sure the fsgid/fsuid of the caller as mapped according to the
idmapped mount can be represented on-disk. Simply add the missing
fsuidgid_has_mapping() line from the VFS may_create() version to
btrfs_may_create().

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d09eaa83b5d2..3661d2ce8ef6 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -877,6 +877,8 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
 		return -EEXIST;
 	if (IS_DEADDIR(dir))
 		return -ENOENT;
+	if (!fsuidgid_has_mapping(dir->i_sb, &init_user_ns))
+		return -EOVERFLOW;
 	return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
 }
 
-- 
cgit v1.2.3


From 4d4340c912ccc351da5578f73c68f1109dcc8e2d Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Tue, 27 Jul 2021 12:48:52 +0200
Subject: btrfs: allow idmapped SNAP_CREATE/SUBVOL_CREATE ioctls

Creating subvolumes and snapshots is one of the core features of btrfs
and is even available to unprivileged users. Make it possible to use
subvolume and snapshot creation on idmapped mounts. This is a fairly
straightforward operation since all the permission checking helpers are
already capable of handling idmapped mounts. So we just need to pass
down the mount's userns.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h |  3 ++-
 fs/btrfs/inode.c |  5 +++--
 fs/btrfs/ioctl.c | 47 +++++++++++++++++++++++++++--------------------
 3 files changed, 32 insertions(+), 23 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a898257ad2b5..f07c82fafa04 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3164,7 +3164,8 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
 			      struct extent_state **cached_state);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *new_root,
-			     struct btrfs_root *parent_root);
+			     struct btrfs_root *parent_root,
+			     struct user_namespace *mnt_userns);
  void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
 			       unsigned *bits);
 void btrfs_clear_delalloc_extent(struct inode *inode,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6ef2448a9937..2aa9646bce56 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9028,7 +9028,8 @@ out:
  */
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *new_root,
-			     struct btrfs_root *parent_root)
+			     struct btrfs_root *parent_root,
+			     struct user_namespace *mnt_userns)
 {
 	struct inode *inode;
 	int err;
@@ -9039,7 +9040,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 	if (err < 0)
 		return err;
 
-	inode = btrfs_new_inode(trans, new_root, &init_user_ns, NULL, "..", 2,
+	inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2,
 				ino, ino,
 				S_IFDIR | (~current_umask() & S_IRWXUGO),
 				&index);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3661d2ce8ef6..910b5142c8c5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -499,8 +499,8 @@ int __pure btrfs_is_empty_uuid(u8 *uuid)
 	return 1;
 }
 
-static noinline int create_subvol(struct inode *dir,
-				  struct dentry *dentry,
+static noinline int create_subvol(struct user_namespace *mnt_userns,
+				  struct inode *dir, struct dentry *dentry,
 				  const char *name, int namelen,
 				  struct btrfs_qgroup_inherit *inherit)
 {
@@ -645,7 +645,7 @@ static noinline int create_subvol(struct inode *dir,
 		goto fail;
 	}
 
-	ret = btrfs_create_subvol_root(trans, new_root, root);
+	ret = btrfs_create_subvol_root(trans, new_root, root, mnt_userns);
 	btrfs_put_root(new_root);
 	if (ret) {
 		/* We potentially lose an unused inode item here */
@@ -871,15 +871,16 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
 }
 
 /* copy of may_create in fs/namei.c() */
-static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
+static inline int btrfs_may_create(struct user_namespace *mnt_userns,
+				   struct inode *dir, struct dentry *child)
 {
 	if (d_really_is_positive(child))
 		return -EEXIST;
 	if (IS_DEADDIR(dir))
 		return -ENOENT;
-	if (!fsuidgid_has_mapping(dir->i_sb, &init_user_ns))
+	if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns))
 		return -EOVERFLOW;
-	return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
+	return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
 }
 
 /*
@@ -888,6 +889,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
  * inside this filesystem so it's quite a bit simpler.
  */
 static noinline int btrfs_mksubvol(const struct path *parent,
+				   struct user_namespace *mnt_userns,
 				   const char *name, int namelen,
 				   struct btrfs_root *snap_src,
 				   bool readonly,
@@ -902,12 +904,12 @@ static noinline int btrfs_mksubvol(const struct path *parent,
 	if (error == -EINTR)
 		return error;
 
-	dentry = lookup_one_len(name, parent->dentry, namelen);
+	dentry = lookup_one(mnt_userns, name, parent->dentry, namelen);
 	error = PTR_ERR(dentry);
 	if (IS_ERR(dentry))
 		goto out_unlock;
 
-	error = btrfs_may_create(dir, dentry);
+	error = btrfs_may_create(mnt_userns, dir, dentry);
 	if (error)
 		goto out_dput;
 
@@ -929,7 +931,7 @@ static noinline int btrfs_mksubvol(const struct path *parent,
 	if (snap_src)
 		error = create_snapshot(snap_src, dir, dentry, readonly, inherit);
 	else
-		error = create_subvol(dir, dentry, name, namelen, inherit);
+		error = create_subvol(mnt_userns, dir, dentry, name, namelen, inherit);
 
 	if (!error)
 		fsnotify_mkdir(dir, dentry);
@@ -943,6 +945,7 @@ out_unlock:
 }
 
 static noinline int btrfs_mksnapshot(const struct path *parent,
+				   struct user_namespace *mnt_userns,
 				   const char *name, int namelen,
 				   struct btrfs_root *root,
 				   bool readonly,
@@ -972,7 +975,7 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
 
 	btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
 
-	ret = btrfs_mksubvol(parent, name, namelen,
+	ret = btrfs_mksubvol(parent, mnt_userns, name, namelen,
 			     root, readonly, inherit);
 out:
 	if (snapshot_force_cow)
@@ -1801,6 +1804,7 @@ out_drop:
 }
 
 static noinline int __btrfs_ioctl_snap_create(struct file *file,
+				struct user_namespace *mnt_userns,
 				const char *name, unsigned long fd, int subvol,
 				bool readonly,
 				struct btrfs_qgroup_inherit *inherit)
@@ -1828,8 +1832,8 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
 	}
 
 	if (subvol) {
-		ret = btrfs_mksubvol(&file->f_path, name, namelen,
-				     NULL, readonly, inherit);
+		ret = btrfs_mksubvol(&file->f_path, mnt_userns, name,
+				     namelen, NULL, readonly, inherit);
 	} else {
 		struct fd src = fdget(fd);
 		struct inode *src_inode;
@@ -1843,16 +1847,17 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
 			btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
 				   "Snapshot src from another FS");
 			ret = -EXDEV;
-		} else if (!inode_owner_or_capable(&init_user_ns, src_inode)) {
+		} else if (!inode_owner_or_capable(mnt_userns, src_inode)) {
 			/*
 			 * Subvolume creation is not restricted, but snapshots
 			 * are limited to own subvolumes only
 			 */
 			ret = -EPERM;
 		} else {
-			ret = btrfs_mksnapshot(&file->f_path, name, namelen,
-					     BTRFS_I(src_inode)->root,
-					     readonly, inherit);
+			ret = btrfs_mksnapshot(&file->f_path, mnt_userns,
+					       name, namelen,
+					       BTRFS_I(src_inode)->root,
+					       readonly, inherit);
 		}
 		fdput(src);
 	}
@@ -1876,8 +1881,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
 		return PTR_ERR(vol_args);
 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 
-	ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd,
-					subvol, false, NULL);
+	ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file),
+					vol_args->name, vol_args->fd, subvol,
+					false, NULL);
 
 	kfree(vol_args);
 	return ret;
@@ -1935,8 +1941,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
 		}
 	}
 
-	ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd,
-					subvol, readonly, inherit);
+	ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file),
+					vol_args->name, vol_args->fd, subvol,
+					readonly, inherit);
 	if (ret)
 		goto free_inherit;
 free_inherit:
-- 
cgit v1.2.3


From c4ed533bdc7960873ab0258a4d18569061b4b0b4 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Tue, 27 Jul 2021 12:48:53 +0200
Subject: btrfs: allow idmapped SNAP_DESTROY ioctls

Destroying subvolumes and snapshots are important features of btrfs.
Both operations are available to unprivileged users if the filesystem
has been mounted with the "user_subvol_rm_allowed" mount option. Allow
subvolume and snapshot deletion on idmapped mounts. This is a fairly
straightforward operation since all the permission checking helpers are
already capable of handling idmapped mounts. So we just need to pass
down the mount's userns.

Subvolumes and snapshots can either be deleted by specifying their name
or - if BTRFS_IOC_SNAP_DESTROY_V2 is used - by their subvolume or
snapshot id if the BTRFS_SUBVOL_SPEC_BY_ID is set.

This feature is blocked on idmapped mounts as this allows filesystem
wide subvolume deletions and thus can escape the scope of what's exposed
under the mount identified by the fd passed with the ioctl.

This means that even the root or CAP_SYS_ADMIN capable user can't delete
a subvolume via BTRFS_SUBVOL_SPEC_BY_ID. This is intentional.

The root user is currently already subject to permission checks in
btrfs_may_delete() including whether the inode's i_uid/i_gid of the
directory the subvolume is located in have a mapping in the caller's
idmapping. For this to fail isn't currently possible since a btrfs
filesystem can't be mounted with a non-initial idmapping but it shows
that even the root user would fail to delete a subvolume if the relevant
inode isn't mapped in their idmapping. The idmapped mount case is the
same in principle.

This isn't a huge problem a root user wanting to delete arbitrary
subvolumes can just always create another (even detached) mount without
an idmapping attached.

In addition, we will allow BTRFS_SUBVOL_SPEC_BY_ID for cases where the
subvolume to delete is directly located under inode referenced by the fd
passed for the ioctl() in a follow-up commit.

Here is an example where a btrfs subvolume is deleted through a
subvolume mount that does not expose the subvolume to be delete but it
can still be deleted by using the subvolume id:

  /* Compile the following program as "delete_by_spec". */

  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <inttypes.h>
  #include <linux/btrfs.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/ioctl.h>
  #include <sys/stat.h>
  #include <sys/types.h>
  #include <unistd.h>

  static int rm_subvolume_by_id(int fd, uint64_t subvolid)
  {
	 struct btrfs_ioctl_vol_args_v2 args = {};
	 int ret;

	 args.flags = BTRFS_SUBVOL_SPEC_BY_ID;
	 args.subvolid = subvolid;

	 ret = ioctl(fd, BTRFS_IOC_SNAP_DESTROY_V2, &args);
	 if (ret < 0)
		 return -1;

	 return 0;
  }

  int main(int argc, char *argv[])
  {
	 int subvolid = 0;

	 if (argc < 3)
		 exit(1);

	 fprintf(stderr, "Opening %s\n", argv[1]);
	 int fd = open(argv[1], O_CLOEXEC | O_DIRECTORY);
	 if (fd < 0)
		 exit(2);

	 subvolid = atoi(argv[2]);

	 fprintf(stderr, "Deleting subvolume with subvolid %d\n", subvolid);
	 int ret = rm_subvolume_by_id(fd, subvolid);
	 if (ret < 0)
		 exit(3);

	 exit(0);
  }
  #include <stdio.h>"
  #include <stdlib.h>"
  #include <linux/btrfs.h"

  truncate -s 10G btrfs.img
  mkfs.btrfs btrfs.img
  export LOOPDEV=$(sudo losetup -f --show btrfs.img)
  mount ${LOOPDEV} /mnt
  sudo chown $(id -u):$(id -g) /mnt
  btrfs subvolume create /mnt/A
  btrfs subvolume create /mnt/B/C
  # Get subvolume id via:
  sudo btrfs subvolume show /mnt/A
  # Save subvolid
  SUBVOLID=<nr>
  sudo umount /mnt
  sudo mount ${LOOPDEV} -o subvol=B/C,user_subvol_rm_allowed /mnt
  ./delete_by_spec /mnt ${SUBVOLID}

With idmapped mounts this can potentially be used by users to delete
subvolumes/snapshots they would otherwise not have access to as the
idmapping would be applied to an inode that is not exposed in the mount
of the subvolume.

The fact that this is a filesystem wide operation suggests it might be a
good idea to expose this under a separate ioctl that clearly indicates
this. In essence, the file descriptor passed with the ioctl is merely
used to identify the filesystem on which to operate when
BTRFS_SUBVOL_SPEC_BY_ID is used.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 910b5142c8c5..6ec30e11ad22 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -837,7 +837,8 @@ free_pending:
  *     nfs_async_unlink().
  */
 
-static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
+static int btrfs_may_delete(struct user_namespace *mnt_userns,
+			    struct inode *dir, struct dentry *victim, int isdir)
 {
 	int error;
 
@@ -847,12 +848,12 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
 	BUG_ON(d_inode(victim->d_parent) != dir);
 	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
 
-	error = inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
+	error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
 	if (error)
 		return error;
 	if (IS_APPEND(dir))
 		return -EPERM;
-	if (check_sticky(&init_user_ns, dir, d_inode(victim)) ||
+	if (check_sticky(mnt_userns, dir, d_inode(victim)) ||
 	    IS_APPEND(d_inode(victim)) || IS_IMMUTABLE(d_inode(victim)) ||
 	    IS_SWAPFILE(d_inode(victim)))
 		return -EPERM;
@@ -2907,6 +2908,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 	struct btrfs_root *dest = NULL;
 	struct btrfs_ioctl_vol_args *vol_args = NULL;
 	struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL;
+	struct user_namespace *mnt_userns = file_mnt_user_ns(file);
 	char *subvol_name, *subvol_name_ptr = NULL;
 	int subvol_namelen;
 	int err = 0;
@@ -2934,6 +2936,18 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 			if (err)
 				goto out;
 		} else {
+			/*
+			 * Deleting by subvolume id can be used to delete
+			 * subvolumes/snapshots anywhere in the filesystem.
+			 * Ensure that users can't abuse idmapped mounts of
+			 * btrfs subvolumes/snapshots to perform operations in
+			 * the whole filesystem.
+			 */
+			if (mnt_userns != &init_user_ns) {
+				err = -EOPNOTSUPP;
+				goto out;
+			}
+
 			if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) {
 				err = -EINVAL;
 				goto out;
@@ -3018,7 +3032,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 	err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
 	if (err == -EINTR)
 		goto free_subvol_name;
-	dentry = lookup_one_len(subvol_name, parent, subvol_namelen);
+	dentry = lookup_one(mnt_userns, subvol_name, parent, subvol_namelen);
 	if (IS_ERR(dentry)) {
 		err = PTR_ERR(dentry);
 		goto out_unlock_dir;
@@ -3060,14 +3074,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 		if (root == dest)
 			goto out_dput;
 
-		err = inode_permission(&init_user_ns, inode,
-				       MAY_WRITE | MAY_EXEC);
+		err = inode_permission(mnt_userns, inode, MAY_WRITE | MAY_EXEC);
 		if (err)
 			goto out_dput;
 	}
 
 	/* check if subvolume may be deleted by a user */
-	err = btrfs_may_delete(dir, dentry, 1);
+	err = btrfs_may_delete(mnt_userns, dir, dentry, 1);
 	if (err)
 		goto out_dput;
 
-- 
cgit v1.2.3


From aabb34e7a31c608dd7c00db9ad320e05941a39d0 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Tue, 27 Jul 2021 12:48:54 +0200
Subject: btrfs: relax restrictions for SNAP_DESTROY_V2 with subvolids

So far we prevented the deletion of subvolumes and snapshots using
subvolume ids possible with the BTRFS_SUBVOL_SPEC_BY_ID flag.

This restriction is necessary on idmapped mounts as this allows
filesystem wide subvolume and snapshot deletions and thus can escape the
scope of what's exposed under the mount identified by the fd passed with
the ioctl.

Deletion by subvolume id works by looking for an alias of the parent of
the subvolume or snapshot to be deleted. The parent alias can be
anywhere in the filesystem. However, as long as the alias of the parent
that is found is the same as the one identified by the file descriptor
passed through the ioctl we can allow the deletion.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 6ec30e11ad22..63e7b2616302 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2936,17 +2936,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 			if (err)
 				goto out;
 		} else {
-			/*
-			 * Deleting by subvolume id can be used to delete
-			 * subvolumes/snapshots anywhere in the filesystem.
-			 * Ensure that users can't abuse idmapped mounts of
-			 * btrfs subvolumes/snapshots to perform operations in
-			 * the whole filesystem.
-			 */
-			if (mnt_userns != &init_user_ns) {
-				err = -EOPNOTSUPP;
-				goto out;
-			}
+			struct inode *old_dir;
 
 			if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) {
 				err = -EINVAL;
@@ -2984,6 +2974,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 				err = PTR_ERR(parent);
 				goto out_drop_write;
 			}
+			old_dir = dir;
 			dir = d_inode(parent);
 
 			/*
@@ -2994,6 +2985,20 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 			 */
 			destroy_parent = true;
 
+			/*
+			 * On idmapped mounts, deletion via subvolid is
+			 * restricted to subvolumes that are immediate
+			 * ancestors of the inode referenced by the file
+			 * descriptor in the ioctl. Otherwise the idmapping
+			 * could potentially be abused to delete subvolumes
+			 * anywhere in the filesystem the user wouldn't be able
+			 * to delete without an idmapped mount.
+			 */
+			if (old_dir != dir && mnt_userns != &init_user_ns) {
+				err = -EOPNOTSUPP;
+				goto free_parent;
+			}
+
 			subvol_name_ptr = btrfs_get_subvol_name_from_objectid(
 						fs_info, vol_args2->subvolid);
 			if (IS_ERR(subvol_name_ptr)) {
-- 
cgit v1.2.3


From e4fed17a32b6b1017ff2fb4cd73938abeeadd907 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Tue, 27 Jul 2021 12:48:55 +0200
Subject: btrfs: allow idmapped SET_RECEIVED_SUBVOL ioctls

The SET_RECEIVED_SUBVOL ioctls are used to set information about
a received subvolume. Make it possible to set information about a
received subvolume on idmapped mounts. This is a fairly straightforward
operation since all the permission checking helpers are already capable
of handling idmapped mounts. So we just need to pass down the mount's
userns.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 63e7b2616302..aa60a9578747 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4448,6 +4448,7 @@ static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
 }
 
 static long _btrfs_ioctl_set_received_subvol(struct file *file,
+					    struct user_namespace *mnt_userns,
 					    struct btrfs_ioctl_received_subvol_args *sa)
 {
 	struct inode *inode = file_inode(file);
@@ -4459,7 +4460,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
 	int ret = 0;
 	int received_uuid_changed;
 
-	if (!inode_owner_or_capable(&init_user_ns, inode))
+	if (!inode_owner_or_capable(mnt_userns, inode))
 		return -EPERM;
 
 	ret = mnt_want_write_file(file);
@@ -4564,7 +4565,7 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file,
 	args64->rtime.nsec = args32->rtime.nsec;
 	args64->flags = args32->flags;
 
-	ret = _btrfs_ioctl_set_received_subvol(file, args64);
+	ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), args64);
 	if (ret)
 		goto out;
 
@@ -4598,7 +4599,7 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
 	if (IS_ERR(sa))
 		return PTR_ERR(sa);
 
-	ret = _btrfs_ioctl_set_received_subvol(file, sa);
+	ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), sa);
 
 	if (ret)
 		goto out;
-- 
cgit v1.2.3


From 39e1674ff0351f6a47d3105e51bb1f9c72b3f20e Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Tue, 27 Jul 2021 12:48:56 +0200
Subject: btrfs: allow idmapped SUBVOL_SETFLAGS ioctl

Setting flags on subvolumes or snapshots are core features of btrfs. The
SUBVOL_SETFLAGS ioctl is especially important as it allows to make
subvolumes and snapshots read-only or read-write. Allow setting flags on
btrfs subvolumes and snapshots on idmapped mounts. This is a fairly
straightforward operation since all the permission checking helpers are
already capable of handling idmapped mounts. So we just need to pass
down the mount's userns.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index aa60a9578747..9d54149bad6e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1988,7 +1988,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
 	u64 flags;
 	int ret = 0;
 
-	if (!inode_owner_or_capable(&init_user_ns, inode))
+	if (!inode_owner_or_capable(file_mnt_user_ns(file), inode))
 		return -EPERM;
 
 	ret = mnt_want_write_file(file);
-- 
cgit v1.2.3


From 6623d9a0b0ce340d3e4dc4b18705ad212a49677a Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Tue, 27 Jul 2021 12:48:57 +0200
Subject: btrfs: allow idmapped INO_LOOKUP_USER ioctl

The INO_LOOKUP_USER is an unprivileged version of the INO_LOOKUP ioctl
and has the following restrictions. The main difference between the two
is that INO_LOOKUP is filesystem wide operation wheres INO_LOOKUP_USER
is scoped beneath the file descriptor passed with the ioctl.
Specifically, INO_LOOKUP_USER must adhere to the following restrictions:

- The caller must be privileged over each inode of each path component
  for the path they are trying to lookup.

- The path for the subvolume the caller is trying to lookup must be reachable
  from the inode associated with the file descriptor passed with the ioctl.

The second condition makes it possible to scope the lookup of the path
to the mount identified by the file descriptor passed with the ioctl.
This allows us to enable this ioctl on idmapped mounts.

Specifically, this is possible because all child subvolumes of a parent
subvolume are reachable when the parent subvolume is mounted. So if the
user had access to open the parent subvolume or has been given the fd
then they can lookup the path if they had access to it provided they
were privileged over each path component.

Note, the INO_LOOKUP_USER ioctl allows a user to learn the path and name
of a subvolume even though they would otherwise be restricted from doing
so via regular VFS-based lookup.

So think about a parent subvolume with multiple child subvolumes.
Someone could mount he parent subvolume and restrict access to the child
subvolumes by overmounting them with empty directories. At this point
the user can't traverse the child subvolumes and they can't open files
in the child subvolumes.  However, they can still learn the path of
child subvolumes as long as they have access to the parent subvolume by
using the INO_LOOKUP_USER ioctl.

The underlying assumption here is that it's ok that the lookup ioctls
can't really take mounts into account other than the original mount the
fd belongs to during lookup. Since this assumption is baked into the
original INO_LOOKUP_USER ioctl we can extend it to idmapped mounts.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9d54149bad6e..41524f9aeac3 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2439,7 +2439,8 @@ out:
 	return ret;
 }
 
-static int btrfs_search_path_in_tree_user(struct inode *inode,
+static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns,
+				struct inode *inode,
 				struct btrfs_ioctl_ino_lookup_user_args *args)
 {
 	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
@@ -2530,7 +2531,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
 				ret = PTR_ERR(temp_inode);
 				goto out_put;
 			}
-			ret = inode_permission(&init_user_ns, temp_inode,
+			ret = inode_permission(mnt_userns, temp_inode,
 					       MAY_READ | MAY_EXEC);
 			iput(temp_inode);
 			if (ret) {
@@ -2672,7 +2673,7 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
 		return -EACCES;
 	}
 
-	ret = btrfs_search_path_in_tree_user(inode, args);
+	ret = btrfs_search_path_in_tree_user(file_mnt_user_ns(file), inode, args);
 
 	if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
 		ret = -EFAULT;
-- 
cgit v1.2.3


From 4a8b34afa9c94c180d16999e405d380cc0477369 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Tue, 27 Jul 2021 12:48:58 +0200
Subject: btrfs: handle ACLs on idmapped mounts

Make the ACL code idmapped mount aware. The POSIX default and POSIX
access ACLs are the only ACLs other than some specific xattrs that take
DAC permissions into account. On an idmapped mount they need to be
translated according to the mount's userns. The main change is done to
__btrfs_set_acl() which is responsible for translating POSIX ACLs to
their final on-disk representation.

The btrfs_init_acl() helper does not need to take the idmapped mount
into account since it is called in the context of file creation
operations (mknod, create, mkdir, symlink, tmpfile) and is used for
btrfs_init_inode_security() to copy POSIX default and POSIX access
permissions from the parent directory. These ACLs need to be inherited
unmodified from the parent directory. This is identical to what we do
for ext4 and xfs.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/acl.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index d95eb5c8cb37..c9f9789e828f 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -53,7 +53,8 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 }
 
 static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
-			 struct inode *inode, struct posix_acl *acl, int type)
+			   struct user_namespace *mnt_userns,
+			   struct inode *inode, struct posix_acl *acl, int type)
 {
 	int ret, size = 0;
 	const char *name;
@@ -114,12 +115,12 @@ int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
 	umode_t old_mode = inode->i_mode;
 
 	if (type == ACL_TYPE_ACCESS && acl) {
-		ret = posix_acl_update_mode(&init_user_ns, inode,
+		ret = posix_acl_update_mode(mnt_userns, inode,
 					    &inode->i_mode, &acl);
 		if (ret)
 			return ret;
 	}
-	ret = __btrfs_set_acl(NULL, inode, acl, type);
+	ret = __btrfs_set_acl(NULL, mnt_userns, inode, acl, type);
 	if (ret)
 		inode->i_mode = old_mode;
 	return ret;
@@ -140,14 +141,14 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans,
 		return ret;
 
 	if (default_acl) {
-		ret = __btrfs_set_acl(trans, inode, default_acl,
+		ret = __btrfs_set_acl(trans, &init_user_ns, inode, default_acl,
 				      ACL_TYPE_DEFAULT);
 		posix_acl_release(default_acl);
 	}
 
 	if (acl) {
 		if (!ret)
-			ret = __btrfs_set_acl(trans, inode, acl,
+			ret = __btrfs_set_acl(trans, &init_user_ns, inode, acl,
 					      ACL_TYPE_ACCESS);
 		posix_acl_release(acl);
 	}
-- 
cgit v1.2.3


From 5b9b26f5d0b88b74001dcfe4ab8a8f2f4e744112 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Tue, 27 Jul 2021 12:48:59 +0200
Subject: btrfs: allow idmapped mount

Now that we converted btrfs internally to account for idmapped mounts
allow the creation of idmapped mounts on by setting the FS_ALLOW_IDMAP
flag.  We only need to raise this flag on the btrfs_root_fs_type
filesystem since btrfs_mount_root() is ultimately responsible for
allocating the superblock and is called into from btrfs_mount()
associated with btrfs_fs_type.

The conversion of the btrfs inode operations was straightforward.
Regarding btrfs specific ioctls that perform checks based on inode
permissions only those have been allowed that are not filesystem wide
operations and hence can be reasonably charged against a specific mount.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 409bee3e7587..537d90bf5d84 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2362,7 +2362,7 @@ static struct file_system_type btrfs_root_fs_type = {
 	.name		= "btrfs",
 	.mount		= btrfs_mount_root,
 	.kill_sb	= btrfs_kill_super,
-	.fs_flags	= FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
+	.fs_flags	= FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
 };
 
 MODULE_ALIAS_FS("btrfs");
-- 
cgit v1.2.3


From 8be2ba2e0e11ade6ab96d8887dbb12abbd3540f4 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 29 Jul 2021 18:52:46 +0100
Subject: btrfs: avoid unnecessarily logging directories that had no changes

There are several cases where when logging an inode we need to log its
parent directories or logging subdirectories when logging a directory.

There are cases however where we end up logging a directory even if it was
not changed in the current transaction, no dentries added or removed since
the last transaction. While this is harmless from a functional point of
view, it is a waste time as it brings no advantage.

One example where this is triggered is the following:

  $ mkfs.btrfs -f /dev/sdc
  $ mount /dev/sdc /mnt

  $ mkdir /mnt/A
  $ mkdir /mnt/B
  $ mkdir /mnt/C

  $ touch /mnt/A/foo
  $ ln /mnt/A/foo /mnt/B/bar
  $ ln /mnt/A/foo /mnt/C/baz

  $ sync

  $ rm -f /mnt/A/foo
  $ xfs_io -c "fsync" /mnt/B/bar

This last fsync ends up logging directories A, B and C, however we only
need to log directory A, as B and C were not changed since the last
transaction commit.

So fix this by changing need_log_inode(), to return false in case the
given inode is a directory and has a ->last_trans value smaller than the
current transaction's ID.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1ce7e4480256..a1aaa1b8bd5c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5613,6 +5613,13 @@ out_unlock:
 static bool need_log_inode(struct btrfs_trans_handle *trans,
 			   struct btrfs_inode *inode)
 {
+	/*
+	 * If a directory was not modified, no dentries added or removed, we can
+	 * and should avoid logging it.
+	 */
+	if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
+		return false;
+
 	/*
 	 * If this inode does not have new/updated/deleted xattrs since the last
 	 * time it was logged and is flagged as logged in the current transaction,
-- 
cgit v1.2.3


From 3736127a3aa805602b7a2ad60ec9cfce68065fbb Mon Sep 17 00:00:00 2001
From: Marcos Paulo de Souza <mpdesouza@suse.com>
Date: Mon, 2 Aug 2021 09:34:00 -0300
Subject: btrfs: tree-log: check btrfs_lookup_data_extent return value

Function btrfs_lookup_data_extent calls btrfs_search_slot to verify if
the EXTENT_ITEM exists in the extent tree. btrfs_search_slot can return
values bellow zero if an error happened.

Function replay_one_extent currently checks if the search found
something (0 returned) and increments the reference, and if not, it
seems to evaluate as 'not found'.

Fix the condition by checking if the value was bellow zero and return
early.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index a1aaa1b8bd5c..f7efc26aa82a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -753,7 +753,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 			 */
 			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
 						ins.offset);
-			if (ret == 0) {
+			if (ret < 0) {
+				goto out;
+			} else if (ret == 0) {
 				btrfs_init_generic_ref(&ref,
 						BTRFS_ADD_DELAYED_REF,
 						ins.objectid, ins.offset, 0);
-- 
cgit v1.2.3


From 93c60b17f2b5fca2c5931d7944788d1ef5f25528 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Wed, 11 Aug 2021 14:37:15 -0400
Subject: btrfs: reduce the preemptive flushing threshold to 90%

The preemptive flushing code was added in order to avoid needing to
synchronously wait for ENOSPC flushing to recover space.  Once we're
almost full however we can essentially flush constantly.  We were using
98% as a threshold to determine if we were simply full, however in
practice this is a really high bar to hit.  For example reports of
systems running into this problem had around 94% usage and thus
continued to flush.  Fix this by lowering the threshold to 90%, which is
a more sane value, especially for smaller file systems.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=212185
CC: stable@vger.kernel.org # 5.12+
Fixes: 576fa34830af ("btrfs: improve preemptive background space flushing")
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/space-info.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index d9c8d738678f..cab532a48bcd 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -733,7 +733,7 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
 {
 	u64 global_rsv_size = fs_info->global_block_rsv.reserved;
 	u64 ordered, delalloc;
-	u64 thresh = div_factor_fine(space_info->total_bytes, 98);
+	u64 thresh = div_factor_fine(space_info->total_bytes, 90);
 	u64 used;
 
 	/* If we're just plain full then async reclaim just slows us down. */
-- 
cgit v1.2.3


From 114623979405abf0b143f9c6688b3ff00ee48338 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Wed, 11 Aug 2021 14:37:16 -0400
Subject: btrfs: do not do preemptive flushing if the majority is global rsv

A common characteristic of the bug report where preemptive flushing was
going full tilt was the fact that the vast majority of the free metadata
space was used up by the global reserve.  The hard 90% threshold would
cover the majority of these cases, but to be even smarter we should take
into account how much of the outstanding reservations are covered by the
global block reserve.  If the global block reserve accounts for the vast
majority of outstanding reservations, skip preemptive flushing, as it
will likely just cause churn and pain.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=212185
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/space-info.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index cab532a48bcd..5ada02e0e629 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -741,6 +741,20 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
 	     global_rsv_size) >= thresh)
 		return false;
 
+	used = space_info->bytes_may_use + space_info->bytes_pinned;
+
+	/* The total flushable belongs to the global rsv, don't flush. */
+	if (global_rsv_size >= used)
+		return false;
+
+	/*
+	 * 128MiB is 1/4 of the maximum global rsv size.  If we have less than
+	 * that devoted to other reservations then there's no sense in flushing,
+	 * we don't have a lot of things that need flushing.
+	 */
+	if (used - global_rsv_size <= SZ_128M)
+		return false;
+
 	/*
 	 * We have tickets queued, bail so we don't compete with the async
 	 * flushers.
-- 
cgit v1.2.3


From 939c7feb19217c752a4b368d35aae1ed98f40b61 Mon Sep 17 00:00:00 2001
From: Naohiro Aota <naohiro.aota@wdc.com>
Date: Wed, 11 Aug 2021 15:37:08 +0900
Subject: btrfs: zoned: fix ordered extent boundary calculation

btrfs_lookup_ordered_extent() is supposed to query the offset in a file
instead of the logical address. Pass the file offset from
submit_extent_page() to calc_bio_boundaries().

Also, calc_bio_boundaries() relies on the bio's operation flag, so move
the call site after setting it.

Fixes: 390ed29b817e ("btrfs: refactor submit_extent_page() to make bio and its flag tracing easier")
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 96de6e70d06c..aaddd7225348 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3241,7 +3241,7 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
 }
 
 static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
-			       struct btrfs_inode *inode)
+			       struct btrfs_inode *inode, u64 file_offset)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	struct btrfs_io_geometry geom;
@@ -3283,7 +3283,7 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
 	}
 
 	/* Ordered extent not yet created, so we're good */
-	ordered = btrfs_lookup_ordered_extent(inode, logical);
+	ordered = btrfs_lookup_ordered_extent(inode, file_offset);
 	if (!ordered) {
 		bio_ctrl->len_to_oe_boundary = U32_MAX;
 		return 0;
@@ -3300,7 +3300,7 @@ static int alloc_new_bio(struct btrfs_inode *inode,
 			 struct writeback_control *wbc,
 			 unsigned int opf,
 			 bio_end_io_t end_io_func,
-			 u64 disk_bytenr, u32 offset,
+			 u64 disk_bytenr, u32 offset, u64 file_offset,
 			 unsigned long bio_flags)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -3317,13 +3317,13 @@ static int alloc_new_bio(struct btrfs_inode *inode,
 		bio = btrfs_bio_alloc(disk_bytenr + offset);
 	bio_ctrl->bio = bio;
 	bio_ctrl->bio_flags = bio_flags;
-	ret = calc_bio_boundaries(bio_ctrl, inode);
-	if (ret < 0)
-		goto error;
 	bio->bi_end_io = end_io_func;
 	bio->bi_private = &inode->io_tree;
 	bio->bi_write_hint = inode->vfs_inode.i_write_hint;
 	bio->bi_opf = opf;
+	ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
+	if (ret < 0)
+		goto error;
 	if (wbc) {
 		struct block_device *bdev;
 
@@ -3398,6 +3398,7 @@ static int submit_extent_page(unsigned int opf,
 		if (!bio_ctrl->bio) {
 			ret = alloc_new_bio(inode, bio_ctrl, wbc, opf,
 					    end_io_func, disk_bytenr, offset,
+					    page_offset(page) + cur,
 					    bio_flags);
 			if (ret < 0)
 				return ret;
-- 
cgit v1.2.3


From 0d977e0eba234e01a60bdde27314dc21374201b3 Mon Sep 17 00:00:00 2001
From: Desmond Cheong Zhi Xi <desmondcheongzx@gmail.com>
Date: Sat, 21 Aug 2021 01:50:40 +0800
Subject: btrfs: reset replace target device to allocation state on close

This crash was observed with a failed assertion on device close:

  BTRFS: Transaction aborted (error -28)
  WARNING: CPU: 1 PID: 3902 at fs/btrfs/extent-tree.c:2150 btrfs_run_delayed_refs+0x1d2/0x1e0 [btrfs]
  Modules linked in: btrfs blake2b_generic libcrc32c crc32c_intel xor zstd_decompress zstd_compress xxhash lzo_compress lzo_decompress raid6_pq loop
  CPU: 1 PID: 3902 Comm: kworker/u8:4 Not tainted 5.14.0-rc5-default+ #1532
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
  Workqueue: events_unbound btrfs_async_reclaim_metadata_space [btrfs]
  RIP: 0010:btrfs_run_delayed_refs+0x1d2/0x1e0 [btrfs]
  RSP: 0018:ffffb7a5452d7d80 EFLAGS: 00010282
  RAX: 0000000000000000 RBX: 0000000000000003 RCX: 0000000000000000
  RDX: 0000000000000001 RSI: ffffffffabee13c4 RDI: 00000000ffffffff
  RBP: ffff97834176a378 R08: 0000000000000001 R09: 0000000000000001
  R10: 0000000000000000 R11: 0000000000000001 R12: ffff97835195d388
  R13: 0000000005b08000 R14: ffff978385484000 R15: 000000000000016c
  FS:  0000000000000000(0000) GS:ffff9783bd800000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 000056190d003fe8 CR3: 000000002a81e005 CR4: 0000000000170ea0
  Call Trace:
   flush_space+0x197/0x2f0 [btrfs]
   btrfs_async_reclaim_metadata_space+0x139/0x300 [btrfs]
   process_one_work+0x262/0x5e0
   worker_thread+0x4c/0x320
   ? process_one_work+0x5e0/0x5e0
   kthread+0x144/0x170
   ? set_kthread_struct+0x40/0x40
   ret_from_fork+0x1f/0x30
  irq event stamp: 19334989
  hardirqs last  enabled at (19334997): [<ffffffffab0e0c87>] console_unlock+0x2b7/0x400
  hardirqs last disabled at (19335006): [<ffffffffab0e0d0d>] console_unlock+0x33d/0x400
  softirqs last  enabled at (19334900): [<ffffffffaba0030d>] __do_softirq+0x30d/0x574
  softirqs last disabled at (19334893): [<ffffffffab0721ec>] irq_exit_rcu+0x12c/0x140
  ---[ end trace 45939e308e0dd3c7 ]---
  BTRFS: error (device vdd) in btrfs_run_delayed_refs:2150: errno=-28 No space left
  BTRFS info (device vdd): forced readonly
  BTRFS warning (device vdd): failed setting block group ro: -30
  BTRFS info (device vdd): suspending dev_replace for unmount
  assertion failed: !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state), in fs/btrfs/volumes.c:1150
  ------------[ cut here ]------------
  kernel BUG at fs/btrfs/ctree.h:3431!
  invalid opcode: 0000 [#1] PREEMPT SMP
  CPU: 1 PID: 3982 Comm: umount Tainted: G        W         5.14.0-rc5-default+ #1532
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
  RIP: 0010:assertfail.constprop.0+0x18/0x1a [btrfs]
  RSP: 0018:ffffb7a5454c7db8 EFLAGS: 00010246
  RAX: 0000000000000068 RBX: ffff978364b91c00 RCX: 0000000000000000
  RDX: 0000000000000000 RSI: ffffffffabee13c4 RDI: 00000000ffffffff
  RBP: ffff9783523a4c00 R08: 0000000000000001 R09: 0000000000000001
  R10: 0000000000000000 R11: 0000000000000001 R12: ffff9783523a4d18
  R13: 0000000000000000 R14: 0000000000000004 R15: 0000000000000003
  FS:  00007f61c8f42800(0000) GS:ffff9783bd800000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 000056190cffa810 CR3: 0000000030b96002 CR4: 0000000000170ea0
  Call Trace:
   btrfs_close_one_device.cold+0x11/0x55 [btrfs]
   close_fs_devices+0x44/0xb0 [btrfs]
   btrfs_close_devices+0x48/0x160 [btrfs]
   generic_shutdown_super+0x69/0x100
   kill_anon_super+0x14/0x30
   btrfs_kill_super+0x12/0x20 [btrfs]
   deactivate_locked_super+0x2c/0xa0
   cleanup_mnt+0x144/0x1b0
   task_work_run+0x59/0xa0
   exit_to_user_mode_loop+0xe7/0xf0
   exit_to_user_mode_prepare+0xaf/0xf0
   syscall_exit_to_user_mode+0x19/0x50
   do_syscall_64+0x4a/0x90
   entry_SYSCALL_64_after_hwframe+0x44/0xae

This happens when close_ctree is called while a dev_replace hasn't
completed. In close_ctree, we suspend the dev_replace, but keep the
replace target around so that we can resume the dev_replace procedure
when we mount the root again. This is the call trace:

  close_ctree():
    btrfs_dev_replace_suspend_for_unmount();
    btrfs_close_devices():
      btrfs_close_fs_devices():
        btrfs_close_one_device():
          ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                 &device->dev_state));

However, since the replace target sticks around, there is a device
with BTRFS_DEV_STATE_REPLACE_TGT set on close, and we fail the
assertion in btrfs_close_one_device.

To fix this, if we come across the replace target device when
closing, we should properly reset it back to allocation state. This
fix also ensures that if a non-target device has a corrupted state and
has the BTRFS_DEV_STATE_REPLACE_TGT bit set, the assertion will still
catch the error.

Reported-by: David Sterba <dsterba@suse.com>
Fixes: b2a616676839 ("btrfs: fix rw device counting in __btrfs_free_extra_devids")
CC: stable@vger.kernel.org # 4.19+
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Desmond Cheong Zhi Xi <desmondcheongzx@gmail.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7fec0c68b744..ec3a874165de 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1118,6 +1118,9 @@ static void btrfs_close_one_device(struct btrfs_device *device)
 		fs_devices->rw_devices--;
 	}
 
+	if (device->devid == BTRFS_DEV_REPLACE_DEVID)
+		clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
+
 	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
 		fs_devices->missing_devices--;
 
-- 
cgit v1.2.3