Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r--  fs/btrfs/extent_io.c  |  737
1 file changed, 421 insertions(+), 316 deletions(-)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0dd24d128986..13bdd60da3c7 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -198,9 +198,8 @@ static void __process_folios_contig(struct address_space *mapping,
u64 end, unsigned long page_ops)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host);
- pgoff_t start_index = start >> PAGE_SHIFT;
+ pgoff_t index = start >> PAGE_SHIFT;
pgoff_t end_index = end >> PAGE_SHIFT;
- pgoff_t index = start_index;
struct folio_batch fbatch;
int i;
@@ -221,7 +220,7 @@ static void __process_folios_contig(struct address_space *mapping,
}
}
-static noinline void __unlock_for_delalloc(const struct inode *inode,
+static noinline void unlock_delalloc_folio(const struct inode *inode,
const struct folio *locked_folio,
u64 start, u64 end)
{
@@ -242,9 +241,8 @@ static noinline int lock_delalloc_folios(struct inode *inode,
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct address_space *mapping = inode->i_mapping;
- pgoff_t start_index = start >> PAGE_SHIFT;
+ pgoff_t index = start >> PAGE_SHIFT;
pgoff_t end_index = end >> PAGE_SHIFT;
- pgoff_t index = start_index;
u64 processed_end = start;
struct folio_batch fbatch;
@@ -288,8 +286,7 @@ static noinline int lock_delalloc_folios(struct inode *inode,
out:
folio_batch_release(&fbatch);
if (processed_end > start)
- __unlock_for_delalloc(inode, locked_folio, start,
- processed_end);
+ unlock_delalloc_folio(inode, locked_folio, start, processed_end);
return -EAGAIN;
}
@@ -390,7 +387,7 @@ again:
unlock_extent(tree, delalloc_start, delalloc_end, &cached_state);
if (!ret) {
- __unlock_for_delalloc(inode, locked_folio, delalloc_start,
+ unlock_delalloc_folio(inode, locked_folio, delalloc_start,
delalloc_end);
cond_resched();
goto again;
@@ -428,14 +425,14 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
ASSERT(folio_pos(folio) <= start &&
- start + len <= folio_pos(folio) + PAGE_SIZE);
+ start + len <= folio_pos(folio) + folio_size(folio));
if (uptodate && btrfs_verify_folio(folio, start, len))
btrfs_folio_set_uptodate(fs_info, folio, start, len);
else
btrfs_folio_clear_uptodate(fs_info, folio, start, len);
- if (!btrfs_is_subpage(fs_info, folio->mapping))
+ if (!btrfs_is_subpage(fs_info, folio))
folio_unlock(folio);
else
btrfs_folio_end_lock(fs_info, folio, start, len);
@@ -491,11 +488,11 @@ static void end_bbio_data_write(struct btrfs_bio *bbio)
static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio)
{
ASSERT(folio_test_locked(folio));
- if (!btrfs_is_subpage(fs_info, folio->mapping))
+ if (!btrfs_is_subpage(fs_info, folio))
return;
ASSERT(folio_test_private(folio));
- btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE);
+ btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), folio_size(folio));
}
/*
@@ -526,8 +523,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
u64 end;
u32 len;
- /* For now only order 0 folios are supported for data. */
- ASSERT(folio_order(folio) == 0);
btrfs_debug(fs_info,
"%s: bi_sector=%llu, err=%d, mirror=%u",
__func__, bio->bi_iter.bi_sector, bio->bi_status,
@@ -555,7 +550,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
if (likely(uptodate)) {
loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> folio_shift(folio);
/*
* Zero out the remaining part if this range straddles
@@ -564,9 +558,11 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
* Here we should only zero the range inside the folio,
* not touch anything else.
*
- * NOTE: i_size is exclusive while end is inclusive.
+ * NOTE: i_size is exclusive while end is inclusive and
+ * folio_contains() takes PAGE_SIZE units.
*/
- if (folio_index(folio) == end_index && i_size <= end) {
+ if (folio_contains(folio, i_size >> PAGE_SHIFT) &&
+ i_size <= end) {
u32 zero_start = max(offset_in_folio(folio, i_size),
offset_in_folio(folio, start));
u32 zero_len = offset_in_folio(folio, end) + 1 -
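As an illustration of the new folio_contains() based check (illustrative numbers only, assuming 4K pages, a 16K folio at file offset 0, i_size = 10000 and a completed read of the block [8192, 12287]):

    folio_contains(folio, 10000 >> PAGE_SHIFT)  -> true (page index 2 lies inside the folio)
    zero_start = max(offset_in_folio(folio, 10000), offset_in_folio(folio, 8192)) = 10000
    zero_len   = offset_in_folio(folio, 12287) + 1 - zero_start = 12288 - 10000 = 2288

so only the part of the just-read block that sits beyond EOF gets zeroed.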
@@ -632,7 +628,7 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
for (allocated = 0; allocated < nr_pages;) {
unsigned int last = allocated;
- allocated = alloc_pages_bulk_array(gfp, nr_pages, page_array);
+ allocated = alloc_pages_bulk(gfp, nr_pages, page_array);
if (unlikely(allocated == last)) {
/* No progress, fail and do cleanup. */
for (int i = 0; i < allocated; i++) {
@@ -710,6 +706,7 @@ static void alloc_new_bio(struct btrfs_inode *inode,
bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info,
bio_ctrl->end_io_func, NULL);
bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+ bbio->bio.bi_write_hint = inode->vfs_inode.i_write_hint;
bbio->inode = inode;
bbio->file_offset = file_offset;
bio_ctrl->bbio = bbio;
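The new bi_write_hint assignment just propagates the inode's write lifetime hint into the data bio. For context, a minimal userspace sketch of where such a hint typically comes from in the first place, via fcntl(F_SET_RW_HINT); this is an illustrative example, and the fallback defines are only needed on older userspace headers:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#ifndef F_SET_RW_HINT
#define F_SET_RW_HINT        1036    /* F_LINUX_SPECIFIC_BASE + 12, from linux/fcntl.h */
#define RWH_WRITE_LIFE_SHORT 2
#endif

int main(void)
{
        uint64_t hint = RWH_WRITE_LIFE_SHORT;   /* data expected to be short-lived */
        int fd = open("testfile", O_CREAT | O_WRONLY, 0644);

        if (fd < 0)
                return 1;
        /* Stores the hint on the inode; writeback bios for this inode now inherit it. */
        if (fcntl(fd, F_SET_RW_HINT, &hint) < 0)
                perror("F_SET_RW_HINT");
        close(fd);
        return 0;
}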
@@ -756,7 +753,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
{
struct btrfs_inode *inode = folio_to_inode(folio);
- ASSERT(pg_offset + size <= PAGE_SIZE);
+ ASSERT(pg_offset + size <= folio_size(folio));
ASSERT(bio_ctrl->end_io_func);
if (bio_ctrl->bbio &&
@@ -839,7 +836,7 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb,
if (folio->mapping)
lockdep_assert_held(&folio->mapping->i_private_lock);
- if (fs_info->nodesize >= PAGE_SIZE) {
+ if (!btrfs_meta_is_subpage(fs_info)) {
if (!folio_test_private(folio))
folio_attach_private(folio, eb);
else
@@ -862,11 +859,6 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb,
return ret;
}
-int set_page_extent_mapped(struct page *page)
-{
- return set_folio_extent_mapped(page_folio(page));
-}
-
int set_folio_extent_mapped(struct folio *folio)
{
struct btrfs_fs_info *fs_info;
@@ -878,7 +870,7 @@ int set_folio_extent_mapped(struct folio *folio)
fs_info = folio_to_fs_info(folio);
- if (btrfs_is_subpage(fs_info, folio->mapping))
+ if (btrfs_is_subpage(fs_info, folio))
return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA);
folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE);
@@ -895,8 +887,8 @@ void clear_folio_extent_mapped(struct folio *folio)
return;
fs_info = folio_to_fs_info(folio);
- if (btrfs_is_subpage(fs_info, folio->mapping))
- return btrfs_detach_subpage(fs_info, folio);
+ if (btrfs_is_subpage(fs_info, folio))
+ return btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA);
folio_detach_private(folio);
}
@@ -943,16 +935,12 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
struct inode *inode = folio->mapping->host;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
u64 start = folio_pos(folio);
- const u64 end = start + PAGE_SIZE - 1;
- u64 cur = start;
+ const u64 end = start + folio_size(folio) - 1;
u64 extent_offset;
u64 last_byte = i_size_read(inode);
- u64 block_start;
struct extent_map *em;
int ret = 0;
- size_t pg_offset = 0;
- size_t iosize;
- size_t blocksize = fs_info->sectorsize;
+ const size_t blocksize = fs_info->sectorsize;
ret = set_folio_extent_mapped(folio);
if (ret < 0) {
@@ -960,28 +948,32 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
return ret;
}
- if (folio->index == last_byte >> folio_shift(folio)) {
+ if (folio_contains(folio, last_byte >> PAGE_SHIFT)) {
size_t zero_offset = offset_in_folio(folio, last_byte);
- if (zero_offset) {
- iosize = folio_size(folio) - zero_offset;
- folio_zero_range(folio, zero_offset, iosize);
- }
+ if (zero_offset)
+ folio_zero_range(folio, zero_offset,
+ folio_size(folio) - zero_offset);
}
bio_ctrl->end_io_func = end_bbio_data_read;
begin_folio_read(fs_info, folio);
- while (cur <= end) {
+ for (u64 cur = start; cur <= end; cur += blocksize) {
enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE;
+ unsigned long pg_offset = offset_in_folio(folio, cur);
bool force_bio_submit = false;
u64 disk_bytenr;
+ u64 block_start;
ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
if (cur >= last_byte) {
- iosize = folio_size(folio) - pg_offset;
- folio_zero_range(folio, pg_offset, iosize);
- end_folio_read(folio, true, cur, iosize);
+ folio_zero_range(folio, pg_offset, end - cur + 1);
+ end_folio_read(folio, true, cur, end - cur + 1);
break;
}
+ if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
+ end_folio_read(folio, true, cur, blocksize);
+ continue;
+ }
em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached);
if (IS_ERR(em)) {
end_folio_read(folio, false, cur, end + 1 - cur);
@@ -993,15 +985,15 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
compress_type = extent_map_compression(em);
- iosize = min(extent_map_end(em) - cur, end - cur + 1);
- iosize = ALIGN(iosize, blocksize);
if (compress_type != BTRFS_COMPRESS_NONE)
disk_bytenr = em->disk_bytenr;
else
disk_bytenr = extent_map_block_start(em) + extent_offset;
- block_start = extent_map_block_start(em);
+
if (em->flags & EXTENT_FLAG_PREALLOC)
block_start = EXTENT_MAP_HOLE;
+ else
+ block_start = extent_map_block_start(em);
/*
* If we have a file range that points to a compressed extent
@@ -1050,18 +1042,13 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
/* we've found a hole, just zero and go on */
if (block_start == EXTENT_MAP_HOLE) {
- folio_zero_range(folio, pg_offset, iosize);
-
- end_folio_read(folio, true, cur, iosize);
- cur = cur + iosize;
- pg_offset += iosize;
+ folio_zero_range(folio, pg_offset, blocksize);
+ end_folio_read(folio, true, cur, blocksize);
continue;
}
/* the get_extent function already copied into the folio */
if (block_start == EXTENT_MAP_INLINE) {
- end_folio_read(folio, true, cur, iosize);
- cur = cur + iosize;
- pg_offset += iosize;
+ end_folio_read(folio, true, cur, blocksize);
continue;
}
@@ -1072,15 +1059,190 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
if (force_bio_submit)
submit_one_bio(bio_ctrl);
- submit_extent_folio(bio_ctrl, disk_bytenr, folio, iosize,
+ submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize,
pg_offset);
- cur = cur + iosize;
- pg_offset += iosize;
}
-
return 0;
}
+/*
+ * Check if we can skip waiting for the @ordered extent covering the block at @fileoff.
+ *
+ * @fileoff: Both input and output.
+ * Input as the file offset where the check should start.
+ * Output as the file offset where the next check should start,
+ * if the function returns true.
+ *
+ * Return true if we can skip to @fileoff. The caller needs to check the new
+ * @fileoff value to make sure it covers the full range, before skipping the
+ * full OE.
+ *
+ * Return false if we must wait for the ordered extent.
+ */
+static bool can_skip_one_ordered_range(struct btrfs_inode *inode,
+ struct btrfs_ordered_extent *ordered,
+ u64 *fileoff)
+{
+ const struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct folio *folio;
+ const u32 blocksize = fs_info->sectorsize;
+ u64 cur = *fileoff;
+ bool ret;
+
+ folio = filemap_get_folio(inode->vfs_inode.i_mapping, cur >> PAGE_SHIFT);
+
+ /*
+ * We should have locked the folio(s) for range [start, end], thus
+ * there must be a folio and it must be locked.
+ */
+ ASSERT(!IS_ERR(folio));
+ ASSERT(folio_test_locked(folio));
+
+ /*
+ * There are several cases for the folio and OE combination:
+ *
+ * 1) Folio has no private flag
+ * The OE has all its IO done but not yet finished, and folio got
+ * invalidated.
+ *
+ * Here we have to wait for the OE to finish, as it may contain the
+ * to-be-inserted data checksum.
+ * Without the data checksum inserted into the csum tree, read will
+ * just fail with missing csum.
+ */
+ if (!folio_test_private(folio)) {
+ ret = false;
+ goto out;
+ }
+
+ /*
+ * 2) The first block is DIRTY.
+ *
+ * This means the OE is created by some other folios whose file pos is
+ * before this one. And since we are holding the folio lock, the writeback
+ * of this folio cannot start.
+ *
+ * We must skip the whole OE, because it will never start until we
+ * finish our folio read and unlock the folio.
+ */
+ if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) {
+ u64 range_len = min(folio_pos(folio) + folio_size(folio),
+ ordered->file_offset + ordered->num_bytes) - cur;
+
+ ret = true;
+ /*
+ * At least inside the folio, all the remaining blocks should
+ * also be dirty.
+ */
+ ASSERT(btrfs_folio_test_dirty(fs_info, folio, cur, range_len));
+ *fileoff = ordered->file_offset + ordered->num_bytes;
+ goto out;
+ }
+
+ /*
+ * 3) The first block is uptodate.
+ *
+ * At least the first block can be skipped, but we are still not fully
+ * sure. E.g. if the OE has some other folios in the range that cannot
+ * be skipped.
+ * So we return true and update @fileoff to the OE/folio boundary.
+ */
+ if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
+ u64 range_len = min(folio_pos(folio) + folio_size(folio),
+ ordered->file_offset + ordered->num_bytes) - cur;
+
+ /*
+ * The whole range to the OE end or folio boundary should also
+ * be uptodate.
+ */
+ ASSERT(btrfs_folio_test_uptodate(fs_info, folio, cur, range_len));
+ ret = true;
+ *fileoff = cur + range_len;
+ goto out;
+ }
+
+ /*
+ * 4) The first block is not uptodate.
+ *
+ * This means the folio was invalidated after the writeback finished,
+ * but then re-inserted into the filemap by some other operation
+ * (e.g. a block aligned buffered write).
+ * Very much the same as case 1).
+ */
+ ret = false;
+out:
+ folio_put(folio);
+ return ret;
+}
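To make the in/out contract concrete (an illustrative example, assuming 4K blocks and 64K folios): if the ordered extent covers [128K, 256K), the read starts at 192K and the folio at 192K is fully dirty, case 2) applies, so the function returns true and advances *fileoff to 256K; can_skip_ordered_extent() below then sees that cur has reached the end of the range and skips the whole OE without waiting.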
+
+static bool can_skip_ordered_extent(struct btrfs_inode *inode,
+ struct btrfs_ordered_extent *ordered,
+ u64 start, u64 end)
+{
+ const u64 range_end = min(end, ordered->file_offset + ordered->num_bytes - 1);
+ u64 cur = max(start, ordered->file_offset);
+
+ while (cur < range_end) {
+ bool can_skip;
+
+ can_skip = can_skip_one_ordered_range(inode, ordered, &cur);
+ if (!can_skip)
+ return false;
+ }
+ return true;
+}
+
+/*
+ * Locking helper to make sure we get a stable view of extent maps for the
+ * involved range.
+ *
+ * This is for folio read paths (read and readahead), thus the involved range
+ * should have all the folios locked.
+ */
+static void lock_extents_for_read(struct btrfs_inode *inode, u64 start, u64 end,
+ struct extent_state **cached_state)
+{
+ u64 cur_pos;
+
+ /* Caller must provide a valid @cached_state. */
+ ASSERT(cached_state);
+
+ /* The range must at least be page aligned, as all read paths are folio based. */
+ ASSERT(IS_ALIGNED(start, PAGE_SIZE));
+ ASSERT(IS_ALIGNED(end + 1, PAGE_SIZE));
+
+again:
+ lock_extent(&inode->io_tree, start, end, cached_state);
+ cur_pos = start;
+ while (cur_pos < end) {
+ struct btrfs_ordered_extent *ordered;
+
+ ordered = btrfs_lookup_ordered_range(inode, cur_pos,
+ end - cur_pos + 1);
+ /*
+ * No ordered extents in the range, and since we hold the extent lock
+ * no one can modify the extent maps in the range, so we're safe to return.
+ */
+ if (!ordered)
+ break;
+
+ /* Check if we can skip waiting for the whole OE. */
+ if (can_skip_ordered_extent(inode, ordered, start, end)) {
+ cur_pos = min(ordered->file_offset + ordered->num_bytes,
+ end + 1);
+ btrfs_put_ordered_extent(ordered);
+ continue;
+ }
+
+ /* Now wait for the OE to finish. */
+ unlock_extent(&inode->io_tree, start, end, cached_state);
+ btrfs_start_ordered_extent_nowriteback(ordered, start, end + 1 - start);
+ btrfs_put_ordered_extent(ordered);
+ /* We have unlocked the whole range, restart from the beginning. */
+ goto again;
+ }
+}
+
int btrfs_read_folio(struct file *file, struct folio *folio)
{
struct btrfs_inode *inode = folio_to_inode(folio);
@@ -1091,7 +1253,7 @@ int btrfs_read_folio(struct file *file, struct folio *folio)
struct extent_map *em_cached = NULL;
int ret;
- btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state);
+ lock_extents_for_read(inode, start, end, &cached_state);
ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL);
unlock_extent(&inode->io_tree, start, end, &cached_state);
@@ -1113,7 +1275,7 @@ static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bit
unsigned int start_bit;
unsigned int nbits;
- ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE);
+ ASSERT(start >= folio_start && start + len <= folio_start + folio_size(folio));
start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
nbits = len >> fs_info->sectorsize_bits;
ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits));
@@ -1126,12 +1288,12 @@ static bool find_next_delalloc_bitmap(struct folio *folio,
{
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
const u64 folio_start = folio_pos(folio);
- const unsigned int bitmap_size = fs_info->sectors_per_page;
+ const unsigned int bitmap_size = btrfs_blocks_per_folio(fs_info, folio);
unsigned int start_bit;
unsigned int first_zero;
unsigned int first_set;
- ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE);
+ ASSERT(start >= folio_start && start < folio_start + folio_size(folio));
start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit);
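btrfs_blocks_per_folio() itself is not defined in this file; judging from how it is used here, it is presumably a small helper along these lines (an assumption for illustration, not the actual definition):

static unsigned int btrfs_blocks_per_folio(const struct btrfs_fs_info *fs_info,
                                           const struct folio *folio)
{
        /* Presumed: number of fs blocks (sectors) covered by one folio. */
        return folio_size(folio) >> fs_info->sectorsize_bits;
}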
@@ -1165,9 +1327,10 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode);
struct writeback_control *wbc = bio_ctrl->wbc;
- const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping);
+ const bool is_subpage = btrfs_is_subpage(fs_info, folio);
const u64 page_start = folio_pos(folio);
const u64 page_end = page_start + folio_size(folio) - 1;
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
unsigned long delalloc_bitmap = 0;
/*
* Save the last found delalloc end. As the delalloc end can go beyond
@@ -1192,14 +1355,14 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
int bit;
/* Save the dirty bitmap as our submission bitmap will be a subset of it. */
- if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) {
- ASSERT(fs_info->sectors_per_page > 1);
+ if (btrfs_is_subpage(fs_info, folio)) {
+ ASSERT(blocks_per_folio > 1);
btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap);
} else {
bio_ctrl->submit_bitmap = 1;
}
- for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) {
+ for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) {
u64 start = page_start + (bit << fs_info->sectorsize_bits);
btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize);
@@ -1266,6 +1429,15 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
wbc);
if (ret >= 0)
last_finished_delalloc_end = found_start + found_len;
+ if (unlikely(ret < 0))
+ btrfs_err_rl(fs_info,
+"failed to run delalloc range, root=%lld ino=%llu folio=%llu submit_bitmap=%*pbl start=%llu len=%u: %d",
+ btrfs_root_id(inode->root),
+ btrfs_ino(inode),
+ folio_pos(folio),
+ blocks_per_folio,
+ &bio_ctrl->submit_bitmap,
+ found_start, found_len, ret);
} else {
/*
* We've hit an error during previous delalloc range,
@@ -1273,7 +1445,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
*/
unlock_extent(&inode->io_tree, found_start,
found_start + found_len - 1, NULL);
- __unlock_for_delalloc(&inode->vfs_inode, folio,
+ unlock_delalloc_folio(&inode->vfs_inode, folio,
found_start,
found_start + found_len - 1);
}
@@ -1308,7 +1480,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
unsigned int bitmap_size = min(
(last_finished_delalloc_end - page_start) >>
fs_info->sectorsize_bits,
- fs_info->sectors_per_page);
+ blocks_per_folio);
for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size)
btrfs_mark_ordered_io_finished(inode, folio,
@@ -1323,7 +1495,7 @@ out:
delalloc_end = page_end;
/*
* delalloc_end is already one less than the total length, so
- * we don't subtract one from PAGE_SIZE
+ * we don't subtract one from PAGE_SIZE.
*/
delalloc_to_write +=
DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE);
@@ -1332,7 +1504,7 @@ out:
* If all ranges are submitted asynchronously, we just need to account
* for them here.
*/
- if (bitmap_empty(&bio_ctrl->submit_bitmap, fs_info->sectors_per_page)) {
+ if (bitmap_empty(&bio_ctrl->submit_bitmap, blocks_per_folio)) {
wbc->nr_to_write -= delalloc_to_write;
return 1;
}
@@ -1375,7 +1547,7 @@ static int submit_one_sector(struct btrfs_inode *inode,
em = btrfs_get_extent(inode, NULL, filepos, sectorsize);
if (IS_ERR(em))
- return PTR_ERR_OR_ZERO(em);
+ return PTR_ERR(em);
extent_offset = filepos - em->start;
em_end = extent_map_end(em);
@@ -1433,6 +1605,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
bool submitted_io = false;
bool error = false;
const u64 folio_start = folio_pos(folio);
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
u64 cur;
int bit;
int ret = 0;
@@ -1441,21 +1614,23 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
start + len <= folio_start + folio_size(folio));
ret = btrfs_writepage_cow_fixup(folio);
- if (ret) {
+ if (ret == -EAGAIN) {
/* Fixup worker will requeue */
folio_redirty_for_writepage(bio_ctrl->wbc, folio);
folio_unlock(folio);
return 1;
}
+ if (ret < 0)
+ return ret;
for (cur = start; cur < start + len; cur += fs_info->sectorsize)
set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap);
bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap,
- fs_info->sectors_per_page);
+ blocks_per_folio);
bio_ctrl->end_io_func = end_bbio_data_write;
- for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) {
+ for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) {
cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits);
if (cur >= i_size) {
@@ -1529,6 +1704,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
size_t pg_offset;
loff_t i_size = i_size_read(&inode->vfs_inode);
unsigned long end_index = i_size >> PAGE_SHIFT;
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc);
@@ -1550,6 +1726,30 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
* The proper bitmap can only be initialized until writepage_delalloc().
*/
bio_ctrl->submit_bitmap = (unsigned long)-1;
+
+ /*
+ * If the page is dirty but without private set, it's marked dirty
+ * without informing the fs.
+ * Nowadays that is a bug, since the introduction of
+ * pin_user_pages*().
+ *
+ * So here we check if the page has private set to rule out such
+ * case.
+ * But we also have a long history of relying on the COW fixup,
+ * so here we only enable this check for experimental builds until
+ * we're sure it's safe.
+ */
+ if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) &&
+ unlikely(!folio_test_private(folio))) {
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ btrfs_err_rl(fs_info,
+ "root %lld ino %llu folio %llu is marked dirty without notifying the fs",
+ inode->root->root_key.objectid,
+ btrfs_ino(inode), folio_pos(folio));
+ ret = -EUCLEAN;
+ goto done;
+ }
+
ret = set_folio_extent_mapped(folio);
if (ret < 0)
goto done;
@@ -1561,9 +1761,15 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
goto done;
ret = extent_writepage_io(inode, folio, folio_pos(folio),
- PAGE_SIZE, bio_ctrl, i_size);
+ folio_size(folio), bio_ctrl, i_size);
if (ret == 1)
return 0;
+ if (ret < 0)
+ btrfs_err_rl(fs_info,
+"failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d",
+ btrfs_root_id(inode->root), btrfs_ino(inode),
+ folio_pos(folio), blocks_per_folio,
+ &bio_ctrl->submit_bitmap, ret);
bio_ctrl->wbc->nr_to_write--;
@@ -1579,12 +1785,6 @@ done:
return ret;
}
-void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
-{
- wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
- TASK_UNINTERRUPTIBLE);
-}
-
/*
* Lock extent buffer status and pages for writeback.
*
@@ -1724,21 +1924,13 @@ static struct extent_buffer *find_extent_buffer_nolock(
static void end_bbio_meta_write(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
- struct btrfs_fs_info *fs_info = eb->fs_info;
- bool uptodate = !bbio->bio.bi_status;
struct folio_iter fi;
- u32 bio_offset = 0;
- if (!uptodate)
+ if (bbio->bio.bi_status != BLK_STS_OK)
set_btree_ioerr(eb);
bio_for_each_folio_all(fi, &bbio->bio) {
- u64 start = eb->start + bio_offset;
- struct folio *folio = fi.folio;
- u32 len = fi.length;
-
- btrfs_folio_clear_writeback(fs_info, folio, start, len);
- bio_offset += len;
+ btrfs_meta_folio_clear_writeback(fi.folio, eb);
}
clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
@@ -1792,38 +1984,21 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
wbc_init_bio(wbc, &bbio->bio);
bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
bbio->file_offset = eb->start;
- if (fs_info->nodesize < PAGE_SIZE) {
- struct folio *folio = eb->folios[0];
- bool ret;
+ for (int i = 0; i < num_extent_folios(eb); i++) {
+ struct folio *folio = eb->folios[i];
+ u64 range_start = max_t(u64, eb->start, folio_pos(folio));
+ u32 range_len = min_t(u64, folio_pos(folio) + folio_size(folio),
+ eb->start + eb->len) - range_start;
folio_lock(folio);
- btrfs_subpage_set_writeback(fs_info, folio, eb->start, eb->len);
- if (btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start,
- eb->len)) {
- folio_clear_dirty_for_io(folio);
- wbc->nr_to_write--;
- }
- ret = bio_add_folio(&bbio->bio, folio, eb->len,
- eb->start - folio_pos(folio));
- ASSERT(ret);
- wbc_account_cgroup_owner(wbc, folio, eb->len);
- folio_unlock(folio);
- } else {
- int num_folios = num_extent_folios(eb);
-
- for (int i = 0; i < num_folios; i++) {
- struct folio *folio = eb->folios[i];
- bool ret;
-
- folio_lock(folio);
- folio_clear_dirty_for_io(folio);
- folio_start_writeback(folio);
- ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
- ASSERT(ret);
- wbc_account_cgroup_owner(wbc, folio, eb->folio_size);
+ btrfs_meta_folio_clear_dirty(folio, eb);
+ btrfs_meta_folio_set_writeback(folio, eb);
+ if (!folio_test_dirty(folio))
wbc->nr_to_write -= folio_nr_pages(folio);
- folio_unlock(folio);
- }
+ bio_add_folio_nofail(&bbio->bio, folio, range_len,
+ offset_in_folio(folio, range_start));
+ wbc_account_cgroup_owner(wbc, folio, range_len);
+ folio_unlock(folio);
}
btrfs_submit_bbio(bbio, 0);
}
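The per-folio range computation above covers both layouts with one loop (illustrative numbers only): with 4K folios and a 16K node at eb->start = 16K, the folio at 20K gets range_start = max(16K, 20K) = 20K and range_len = min(24K, 32K) - 20K = 4K; with a single 64K folio at offset 0 and the same node, range_start = max(16K, 0) = 16K and range_len = min(64K, 32K) - 16K = 16K, i.e. the whole eb->len.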
@@ -1849,9 +2024,10 @@ static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc)
u64 folio_start = folio_pos(folio);
int bit_start = 0;
int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
/* Lock and write each dirty extent buffers in the range */
- while (bit_start < fs_info->sectors_per_page) {
+ while (bit_start < blocks_per_folio) {
struct btrfs_subpage *subpage = folio_get_private(folio);
struct extent_buffer *eb;
unsigned long flags;
@@ -1867,11 +2043,11 @@ static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc)
break;
}
spin_lock_irqsave(&subpage->lock, flags);
- if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * fs_info->sectors_per_page,
+ if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * blocks_per_folio,
subpage->bitmaps)) {
spin_unlock_irqrestore(&subpage->lock, flags);
spin_unlock(&folio->mapping->i_private_lock);
- bit_start++;
+ bit_start += sectors_per_node;
continue;
}
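For example, with 4K sectors and a 16K nodesize, sectors_per_node is 4, so when the dirty bit for the first sector of a slot is clear the scan now steps over the whole (clean) node instead of re-testing it one sector at a time.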
@@ -1933,7 +2109,7 @@ static int submit_eb_page(struct folio *folio, struct btrfs_eb_write_context *ct
if (!folio_test_private(folio))
return 0;
- if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE)
+ if (btrfs_meta_is_subpage(folio_to_fs_info(folio)))
return submit_eb_subpage(folio, wbc);
spin_lock(&mapping->i_private_lock);
@@ -2192,10 +2368,8 @@ retry:
done_index = folio_next_index(folio);
/*
* At this point we hold neither the i_pages lock nor
- * the page lock: the page may be truncated or
- * invalidated (changing page->mapping to NULL),
- * or even swizzled back from swapper_space to
- * tmpfs file mapping
+ * the folio lock: the folio may be truncated or
+ * invalidated (changing folio->mapping to NULL).
*/
if (!folio_trylock(folio)) {
submit_write_bio(bio_ctrl, 0);
@@ -2233,7 +2407,7 @@ retry:
* regular submission.
*/
if (wbc->sync_mode != WB_SYNC_NONE ||
- btrfs_is_subpage(inode_to_fs_info(inode), mapping)) {
+ btrfs_is_subpage(inode_to_fs_info(inode), folio)) {
if (folio_test_writeback(folio))
submit_write_bio(bio_ctrl, 0);
folio_wait_writeback(folio);
@@ -2314,8 +2488,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
while (cur <= end) {
- u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
- u32 cur_len = cur_end + 1 - cur;
+ u64 cur_end;
+ u32 cur_len;
struct folio *folio;
folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT);
@@ -2325,13 +2499,18 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
* code is just in case, but shouldn't actually be run.
*/
if (IS_ERR(folio)) {
+ cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
+ cur_len = cur_end + 1 - cur;
btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL,
cur, cur_len, false);
mapping_set_error(mapping, PTR_ERR(folio));
- cur = cur_end + 1;
+ cur = cur_end;
continue;
}
+ cur_end = min_t(u64, folio_pos(folio) + folio_size(folio) - 1, end);
+ cur_len = cur_end + 1 - cur;
+
ASSERT(folio_test_locked(folio));
if (pages_dirty && folio != locked_folio)
ASSERT(folio_test_dirty(folio));
@@ -2390,7 +2569,7 @@ void btrfs_readahead(struct readahead_control *rac)
struct extent_map *em_cached = NULL;
u64 prev_em_start = (u64)-1;
- btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state);
+ lock_extents_for_read(inode, start, end, &cached_state);
while ((folio = readahead_folio(rac)) != NULL)
btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start);
@@ -2443,7 +2622,7 @@ static bool try_release_extent_state(struct extent_io_tree *tree,
struct folio *folio)
{
u64 start = folio_pos(folio);
- u64 end = start + PAGE_SIZE - 1;
+ u64 end = start + folio_size(folio) - 1;
bool ret;
if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) {
@@ -2481,7 +2660,7 @@ static bool try_release_extent_state(struct extent_io_tree *tree,
bool try_release_extent_mapping(struct folio *folio, gfp_t mask)
{
u64 start = folio_pos(folio);
- u64 end = start + PAGE_SIZE - 1;
+ u64 end = start + folio_size(folio) - 1;
struct btrfs_inode *inode = folio_to_inode(folio);
struct extent_io_tree *io_tree = &inode->io_tree;
@@ -2554,11 +2733,6 @@ next:
return try_release_extent_state(io_tree, folio);
}
-static void __free_extent_buffer(struct extent_buffer *eb)
-{
- kmem_cache_free(extent_buffer_cache, eb);
-}
-
static int extent_buffer_under_io(const struct extent_buffer *eb)
{
return (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
@@ -2597,7 +2771,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
return;
}
- if (fs_info->nodesize >= PAGE_SIZE) {
+ if (!btrfs_meta_is_subpage(fs_info)) {
/*
* We do this since we'll remove the pages after we've
* removed the eb from the radix tree, so we could race
@@ -2623,7 +2797,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
* attached to one dummy eb, no sharing.
*/
if (!mapped) {
- btrfs_detach_subpage(fs_info, folio);
+ btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA);
return;
}
@@ -2634,13 +2808,13 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
* page range and no unfinished IO.
*/
if (!folio_range_has_eb(folio))
- btrfs_detach_subpage(fs_info, folio);
+ btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA);
spin_unlock(&folio->mapping->i_private_lock);
}
-/* Release all pages attached to the extent buffer */
-static void btrfs_release_extent_buffer_pages(const struct extent_buffer *eb)
+/* Release all folios attached to the extent buffer */
+static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb)
{
ASSERT(!extent_buffer_under_io(eb));
@@ -2662,20 +2836,19 @@ static void btrfs_release_extent_buffer_pages(const struct extent_buffer *eb)
*/
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
- btrfs_release_extent_buffer_pages(eb);
+ btrfs_release_extent_buffer_folios(eb);
btrfs_leak_debug_del_eb(eb);
- __free_extent_buffer(eb);
+ kmem_cache_free(extent_buffer_cache, eb);
}
-static struct extent_buffer *
-__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
- unsigned long len)
+static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start)
{
struct extent_buffer *eb = NULL;
eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
eb->start = start;
- eb->len = len;
+ eb->len = fs_info->nodesize;
eb->fs_info = fs_info;
init_rwsem(&eb->lock);
@@ -2684,7 +2857,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
spin_lock_init(&eb->refs_lock);
atomic_set(&eb->refs, 1);
- ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
+ ASSERT(eb->len <= BTRFS_MAX_METADATA_BLOCKSIZE);
return eb;
}
@@ -2692,10 +2865,9 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
{
struct extent_buffer *new;
- int num_folios = num_extent_folios(src);
int ret;
- new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
+ new = __alloc_extent_buffer(src->fs_info, src->start);
if (new == NULL)
return NULL;
@@ -2712,7 +2884,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
return NULL;
}
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(src); i++) {
struct folio *folio = new->folios[i];
ret = attach_extent_buffer_folio(new, folio, NULL);
@@ -2728,26 +2900,24 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
return new;
}
-struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start, unsigned long len)
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start)
{
struct extent_buffer *eb;
- int num_folios = 0;
int ret;
- eb = __alloc_extent_buffer(fs_info, start, len);
+ eb = __alloc_extent_buffer(fs_info, start);
if (!eb)
return NULL;
ret = alloc_eb_folio_array(eb, false);
if (ret)
- goto err;
+ goto out;
- num_folios = num_extent_folios(eb);
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(eb); i++) {
ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL);
if (ret < 0)
- goto err;
+ goto out_detach;
}
set_extent_buffer_uptodate(eb);
@@ -2755,23 +2925,19 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
return eb;
-err:
- for (int i = 0; i < num_folios; i++) {
+
+out_detach:
+ for (int i = 0; i < num_extent_folios(eb); i++) {
if (eb->folios[i]) {
detach_extent_buffer_folio(eb, eb->folios[i]);
folio_put(eb->folios[i]);
}
}
- __free_extent_buffer(eb);
+out:
+ kmem_cache_free(extent_buffer_cache, eb);
return NULL;
}
-struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start)
-{
- return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
-}
-
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
int refs;
@@ -2810,11 +2976,9 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
static void mark_extent_buffer_accessed(struct extent_buffer *eb)
{
- int num_folios= num_extent_folios(eb);
-
check_buffer_tree_ref(eb);
- for (int i = 0; i < num_folios; i++)
+ for (int i = 0; i < num_extent_folios(eb); i++)
folio_mark_accessed(eb->folios[i]);
}
@@ -2847,10 +3011,10 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
return eb;
}
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start)
{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *eb, *exists = NULL;
int ret;
@@ -2886,23 +3050,25 @@ again:
free_eb:
btrfs_release_extent_buffer(eb);
return exists;
-}
+#else
+ /* Stub to avoid linker error when compiled with optimizations turned off. */
+ return NULL;
#endif
+}
-static struct extent_buffer *grab_extent_buffer(
- struct btrfs_fs_info *fs_info, struct page *page)
+static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info,
+ struct folio *folio)
{
- struct folio *folio = page_folio(page);
struct extent_buffer *exists;
- lockdep_assert_held(&page->mapping->i_private_lock);
+ lockdep_assert_held(&folio->mapping->i_private_lock);
/*
* For subpage case, we completely rely on radix tree to ensure we
* don't try to insert two ebs for the same bytenr. So here we always
* return NULL and just continue.
*/
- if (fs_info->nodesize < PAGE_SIZE)
+ if (btrfs_meta_is_subpage(fs_info))
return NULL;
/* Page not yet attached to an extent buffer */
@@ -2910,7 +3076,7 @@ static struct extent_buffer *grab_extent_buffer(
return NULL;
/*
- * We could have already allocated an eb for this page and attached one
+ * We could have already allocated an eb for this folio and attached one
* so lets see if we can get a ref on the existing eb, and if we can we
* know it's good and we can just return that one, else we know we can
* just overwrite folio private.
@@ -2919,16 +3085,19 @@ static struct extent_buffer *grab_extent_buffer(
if (atomic_inc_not_zero(&exists->refs))
return exists;
- WARN_ON(PageDirty(page));
+ WARN_ON(folio_test_dirty(folio));
folio_detach_private(folio);
return NULL;
}
-static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
+/*
+ * Validate alignment constraints of eb at logical address @start.
+ */
+static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
{
if (!IS_ALIGNED(start, fs_info->sectorsize)) {
btrfs_err(fs_info, "bad tree block start %llu", start);
- return -EINVAL;
+ return true;
}
if (fs_info->nodesize < PAGE_SIZE &&
@@ -2936,14 +3105,14 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
btrfs_err(fs_info,
"tree block crosses page boundary, start %llu nodesize %u",
start, fs_info->nodesize);
- return -EINVAL;
+ return true;
}
if (fs_info->nodesize >= PAGE_SIZE &&
!PAGE_ALIGNED(start)) {
btrfs_err(fs_info,
"tree block is not page aligned, start %llu nodesize %u",
start, fs_info->nodesize);
- return -EINVAL;
+ return true;
}
if (!IS_ALIGNED(start, fs_info->nodesize) &&
!test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) {
@@ -2951,10 +3120,9 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
"tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance",
start, fs_info->nodesize);
}
- return 0;
+ return false;
}
-
/*
* Return 0 if eb->folios[i] is attached to btree inode successfully.
* Return >0 if there is already another extent buffer for the range,
@@ -3003,15 +3171,14 @@ retry:
finish:
spin_lock(&mapping->i_private_lock);
- if (existing_folio && fs_info->nodesize < PAGE_SIZE) {
+ if (existing_folio && btrfs_meta_is_subpage(fs_info)) {
/* We're going to reuse the existing page, can drop our folio now. */
__free_page(folio_page(eb->folios[i], 0));
eb->folios[i] = existing_folio;
} else if (existing_folio) {
struct extent_buffer *existing_eb;
- existing_eb = grab_extent_buffer(fs_info,
- folio_page(existing_folio, 0));
+ existing_eb = grab_extent_buffer(fs_info, existing_folio);
if (existing_eb) {
/* The extent buffer still exists, we can use it directly. */
*found_eb_ret = existing_eb;
@@ -3046,8 +3213,6 @@ finish:
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, u64 owner_root, int level)
{
- unsigned long len = fs_info->nodesize;
- int num_folios;
int attached = 0;
struct extent_buffer *eb;
struct extent_buffer *existing_eb = NULL;
@@ -3075,7 +3240,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
if (eb)
return eb;
- eb = __alloc_extent_buffer(fs_info, start, len);
+ eb = __alloc_extent_buffer(fs_info, start);
if (!eb)
return ERR_PTR(-ENOMEM);
@@ -3095,8 +3260,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
* The memory will be freed by attach_extent_buffer_page() or freed
* manually if we exit earlier.
*/
- if (fs_info->nodesize < PAGE_SIZE) {
- prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
+ if (btrfs_meta_is_subpage(fs_info)) {
+ prealloc = btrfs_alloc_subpage(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA);
if (IS_ERR(prealloc)) {
ret = PTR_ERR(prealloc);
goto out;
@@ -3111,9 +3276,8 @@ reallocate:
goto out;
}
- num_folios = num_extent_folios(eb);
/* Attach all pages to the filemap. */
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio;
ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb);
@@ -3153,7 +3317,7 @@ reallocate:
* and free the allocated page.
*/
folio = eb->folios[i];
- WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len));
+ WARN_ON(btrfs_meta_folio_test_dirty(folio, eb));
/*
* Check if the current page is physically contiguous with previous eb
@@ -3164,7 +3328,7 @@ reallocate:
if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0))
page_contig = false;
- if (!btrfs_folio_test_uptodate(fs_info, folio, eb->start, eb->len))
+ if (!btrfs_meta_folio_test_uptodate(folio, eb))
uptodate = 0;
/*
@@ -3207,8 +3371,8 @@ again:
* btree_release_folio will correctly detect that a page belongs to a
* live buffer and won't free them prematurely.
*/
- for (int i = 0; i < num_folios; i++)
- unlock_page(folio_page(eb->folios[i], 0));
+ for (int i = 0; i < num_extent_folios(eb); i++)
+ folio_unlock(eb->folios[i]);
return eb;
out:
@@ -3232,13 +3396,13 @@ out:
for (int i = 0; i < attached; i++) {
ASSERT(eb->folios[i]);
detach_extent_buffer_folio(eb, eb->folios[i]);
- unlock_page(folio_page(eb->folios[i], 0));
+ folio_unlock(eb->folios[i]);
folio_put(eb->folios[i]);
eb->folios[i] = NULL;
}
/*
* Now all pages of that extent buffer is unmapped, set UNMAPPED flag,
- * so it can be cleaned up without utilizing page->mapping.
+ * so it can be cleaned up without utilizing folio->mapping.
*/
set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
@@ -3254,7 +3418,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
struct extent_buffer *eb =
container_of(head, struct extent_buffer, rcu_head);
- __free_extent_buffer(eb);
+ kmem_cache_free(extent_buffer_cache, eb);
}
static int release_extent_buffer(struct extent_buffer *eb)
@@ -3278,11 +3442,11 @@ static int release_extent_buffer(struct extent_buffer *eb)
}
btrfs_leak_debug_del_eb(eb);
- /* Should be safe to release our pages at this point */
- btrfs_release_extent_buffer_pages(eb);
+ /* Should be safe to release folios at this point. */
+ btrfs_release_extent_buffer_folios(eb);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
- __free_extent_buffer(eb);
+ kmem_cache_free(extent_buffer_cache, eb);
return 1;
}
#endif
@@ -3338,38 +3502,21 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
release_extent_buffer(eb);
}
-static void btree_clear_folio_dirty(struct folio *folio)
+static void btree_clear_folio_dirty_tag(struct folio *folio)
{
- ASSERT(folio_test_dirty(folio));
+ ASSERT(!folio_test_dirty(folio));
ASSERT(folio_test_locked(folio));
- folio_clear_dirty_for_io(folio);
xa_lock_irq(&folio->mapping->i_pages);
if (!folio_test_dirty(folio))
- __xa_clear_mark(&folio->mapping->i_pages,
- folio_index(folio), PAGECACHE_TAG_DIRTY);
+ __xa_clear_mark(&folio->mapping->i_pages, folio->index,
+ PAGECACHE_TAG_DIRTY);
xa_unlock_irq(&folio->mapping->i_pages);
}
-static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
-{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- struct folio *folio = eb->folios[0];
- bool last;
-
- /* btree_clear_folio_dirty() needs page locked. */
- folio_lock(folio);
- last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, eb->len);
- if (last)
- btree_clear_folio_dirty(folio);
- folio_unlock(folio);
- WARN_ON(atomic_read(&eb->refs) == 0);
-}
-
void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- int num_folios;
btrfs_assert_tree_write_locked(eb);
@@ -3396,17 +3543,16 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len,
fs_info->dirty_metadata_batch);
- if (eb->fs_info->nodesize < PAGE_SIZE)
- return clear_subpage_extent_buffer_dirty(eb);
-
- num_folios = num_extent_folios(eb);
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio = eb->folios[i];
+ bool last;
if (!folio_test_dirty(folio))
continue;
folio_lock(folio);
- btree_clear_folio_dirty(folio);
+ last = btrfs_meta_folio_clear_and_test_dirty(folio, eb);
+ if (last)
+ btree_clear_folio_dirty_tag(folio);
folio_unlock(folio);
}
WARN_ON(atomic_read(&eb->refs) == 0);
@@ -3414,92 +3560,66 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
void set_extent_buffer_dirty(struct extent_buffer *eb)
{
- int num_folios;
bool was_dirty;
check_buffer_tree_ref(eb);
was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
- num_folios = num_extent_folios(eb);
WARN_ON(atomic_read(&eb->refs) == 0);
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags));
if (!was_dirty) {
- bool subpage = eb->fs_info->nodesize < PAGE_SIZE;
+ bool subpage = btrfs_meta_is_subpage(eb->fs_info);
/*
* For subpage case, we can have other extent buffers in the
- * same page, and in clear_subpage_extent_buffer_dirty() we
+ * same page, and in clear_extent_buffer_dirty() we
* have to clear page dirty without subpage lock held.
* This can cause race where our page gets dirty cleared after
* we just set it.
*
- * Thankfully, clear_subpage_extent_buffer_dirty() has locked
+ * Thankfully, clear_extent_buffer_dirty() has locked
* its page for other reasons, we can use page lock to prevent
* the above race.
*/
if (subpage)
- lock_page(folio_page(eb->folios[0], 0));
- for (int i = 0; i < num_folios; i++)
- btrfs_folio_set_dirty(eb->fs_info, eb->folios[i],
- eb->start, eb->len);
+ folio_lock(eb->folios[0]);
+ for (int i = 0; i < num_extent_folios(eb); i++)
+ btrfs_meta_folio_set_dirty(eb->folios[i], eb);
if (subpage)
- unlock_page(folio_page(eb->folios[0], 0));
+ folio_unlock(eb->folios[0]);
percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes,
eb->len,
eb->fs_info->dirty_metadata_batch);
}
#ifdef CONFIG_BTRFS_DEBUG
- for (int i = 0; i < num_folios; i++)
+ for (int i = 0; i < num_extent_folios(eb); i++)
ASSERT(folio_test_dirty(eb->folios[i]));
#endif
}
void clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- int num_folios = num_extent_folios(eb);
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio = eb->folios[i];
if (!folio)
continue;
- /*
- * This is special handling for metadata subpage, as regular
- * btrfs_is_subpage() can not handle cloned/dummy metadata.
- */
- if (fs_info->nodesize >= PAGE_SIZE)
- folio_clear_uptodate(folio);
- else
- btrfs_subpage_clear_uptodate(fs_info, folio,
- eb->start, eb->len);
+ btrfs_meta_folio_clear_uptodate(folio, eb);
}
}
void set_extent_buffer_uptodate(struct extent_buffer *eb)
{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- int num_folios = num_extent_folios(eb);
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- for (int i = 0; i < num_folios; i++) {
- struct folio *folio = eb->folios[i];
-
- /*
- * This is special handling for metadata subpage, as regular
- * btrfs_is_subpage() can not handle cloned/dummy metadata.
- */
- if (fs_info->nodesize >= PAGE_SIZE)
- folio_mark_uptodate(folio);
- else
- btrfs_subpage_set_uptodate(fs_info, folio,
- eb->start, eb->len);
- }
+ for (int i = 0; i < num_extent_folios(eb); i++)
+ btrfs_meta_folio_set_uptodate(eb->folios[i], eb);
}
static void clear_extent_buffer_reading(struct extent_buffer *eb)
@@ -3512,10 +3632,7 @@ static void clear_extent_buffer_reading(struct extent_buffer *eb)
static void end_bbio_meta_read(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
- struct btrfs_fs_info *fs_info = eb->fs_info;
bool uptodate = !bbio->bio.bi_status;
- struct folio_iter fi;
- u32 bio_offset = 0;
/*
* If the extent buffer is marked UPTODATE before the read operation
@@ -3537,30 +3654,16 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio)
set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
}
- bio_for_each_folio_all(fi, &bbio->bio) {
- struct folio *folio = fi.folio;
- u64 start = eb->start + bio_offset;
- u32 len = fi.length;
-
- if (uptodate)
- btrfs_folio_set_uptodate(fs_info, folio, start, len);
- else
- btrfs_folio_clear_uptodate(fs_info, folio, start, len);
-
- bio_offset += len;
- }
-
clear_extent_buffer_reading(eb);
free_extent_buffer(eb);
bio_put(&bbio->bio);
}
-int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
- const struct btrfs_tree_parent_check *check)
+int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num,
+ const struct btrfs_tree_parent_check *check)
{
struct btrfs_bio *bbio;
- bool ret;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
@@ -3575,7 +3678,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
/* Someone else is already reading the buffer, just wait for it. */
if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags))
- goto done;
+ return 0;
/*
* Between the initial test_bit(EXTENT_BUFFER_UPTODATE) and the above
@@ -3600,29 +3703,31 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
bbio->file_offset = eb->start;
memcpy(&bbio->parent_check, check, sizeof(*check));
- if (eb->fs_info->nodesize < PAGE_SIZE) {
- ret = bio_add_folio(&bbio->bio, eb->folios[0], eb->len,
- eb->start - folio_pos(eb->folios[0]));
- ASSERT(ret);
- } else {
- int num_folios = num_extent_folios(eb);
-
- for (int i = 0; i < num_folios; i++) {
- struct folio *folio = eb->folios[i];
+ for (int i = 0; i < num_extent_folios(eb); i++) {
+ struct folio *folio = eb->folios[i];
+ u64 range_start = max_t(u64, eb->start, folio_pos(folio));
+ u32 range_len = min_t(u64, folio_pos(folio) + folio_size(folio),
+ eb->start + eb->len) - range_start;
- ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
- ASSERT(ret);
- }
+ bio_add_folio_nofail(&bbio->bio, folio, range_len,
+ offset_in_folio(folio, range_start));
}
btrfs_submit_bbio(bbio, mirror_num);
+ return 0;
+}
-done:
- if (wait == WAIT_COMPLETE) {
- wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE);
- if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
- return -EIO;
- }
+int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num,
+ const struct btrfs_tree_parent_check *check)
+{
+ int ret;
+
+ ret = read_extent_buffer_pages_nowait(eb, mirror_num, check);
+ if (ret < 0)
+ return ret;
+ wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE);
+ if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
+ return -EIO;
return 0;
}
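With the old WAIT_NONE/WAIT_COMPLETE argument gone, callers now pick the behaviour explicitly. A rough sketch of the two call patterns (eb, mirror_num and check come from the surrounding code, error handling trimmed):

        /* Synchronous read: submit and wait, -EIO if the eb did not become uptodate. */
        ret = read_extent_buffer_pages(eb, 0, &check);

        /* Readahead, as in btrfs_readahead_tree_block() below: submit only, never wait. */
        ret = read_extent_buffer_pages_nowait(eb, 0, &check);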
@@ -3794,7 +3899,7 @@ static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i)
if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
return;
- if (fs_info->nodesize < PAGE_SIZE) {
+ if (btrfs_meta_is_subpage(fs_info)) {
folio = eb->folios[0];
ASSERT(i == 0);
if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio,
@@ -4280,7 +4385,7 @@ int try_release_extent_buffer(struct folio *folio)
{
struct extent_buffer *eb;
- if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE)
+ if (btrfs_meta_is_subpage(folio_to_fs_info(folio)))
return try_release_subpage_extent_buffer(folio);
/*
@@ -4353,7 +4458,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
return;
}
- ret = read_extent_buffer_pages(eb, WAIT_NONE, 0, &check);
+ ret = read_extent_buffer_pages_nowait(eb, 0, &check);
if (ret < 0)
free_extent_buffer_stale(eb);
else