summaryrefslogtreecommitdiff
path: root/fs/btrfs/file.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-02-21 21:00:39 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2021-02-21 21:00:39 +0300
commit6f3952cbe00b74739f540981d1afe84cd4dac879 (patch)
tree6dacecd7ea8dc06d985c7ea417391c1ae2aec574 /fs/btrfs/file.c
parentf9d58de23152f2c16f326d7e014cfa2933b00304 (diff)
parent9d294a685fbcb256ce8c5f7fd88a7596d0f52a8a (diff)
downloadlinux-6f3952cbe00b74739f540981d1afe84cd4dac879.tar.xz
Merge tag 'for-5.12-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba: "This brings updates of space handling, performance improvements or bug fixes. The subpage block size and zoned mode features have reached state where they're usable but with limitations. Performance or related: - do not block on deleted block group mutex in the cleaner, avoids some long stalls - improved flushing: make it work better with ticket space reservations and avoid excessive transaction commits in some scenarios, slightly improves throughput for random write load - preemptive background flushing: separate the logic from ticket reservations, improve the accounting and decisions when to flush in low space conditions - less lock contention related to running delayed refs, let just one thread do the flushing when there are many inside transaction commit - dbench workload improvements: avoid unnecessary work when logging inodes, fewer fallbacks to transaction commit and thus less waiting for it (+7% throughput, -20% latency) Core: - subpage block size - currently read-only support - refactor and generalize code where sectorsize is assumed to be page size, add the subpage handling everywhere - the read-write support is on the way, page sizes are still limited to 4K or 64K - zoned mode, first working version but with limitations - SMR/ZBC/ZNS friendly allocation mode, utilizing the "no fixed location for structures" and chunked allocation - superblock as the only fixed data structure needs special handling, uses 2 consecutive zones as a ring buffer - tree-log support with a dedicated block group to avoid unordered writes - emulated zones on non-zoned devices - not yet working - all non-single block group profiles, requires more zone write pointer synchronization between the multiple block groups - fitrim due to dependency on space cache, can be implemented Fixes: - ref-verify: proper tree owner and node level tracking - fix pinned byte accounting, causing some early ENOSPC now more likely due to other changes in delayed refs Other: - error handling fixes and improvements - more error injection points - more function documentation - more and updated tracepoints - subset of W=1 checked by default - update comments to allow more automatic kdoc parameter checks" * tag 'for-5.12-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (144 commits) btrfs: zoned: enable to mount ZONED incompat flag btrfs: zoned: deal with holes writing out tree-log pages btrfs: zoned: reorder log node allocation on zoned filesystem btrfs: zoned: serialize log transaction on zoned filesystems btrfs: zoned: extend zoned allocator to use dedicated tree-log block group btrfs: split alloc_log_tree() btrfs: zoned: relocate block group to repair IO failure in zoned filesystems btrfs: zoned: enable relocation on a zoned filesystem btrfs: zoned: support dev-replace in zoned filesystems btrfs: zoned: implement copying for zoned device-replace btrfs: zoned: implement cloning for zoned device-replace btrfs: zoned: mark block groups to copy for device-replace btrfs: zoned: do not use async metadata checksum on zoned filesystems btrfs: zoned: wait for existing extents before truncating btrfs: zoned: serialize metadata IO btrfs: zoned: introduce dedicated data write path for zoned filesystems btrfs: zoned: enable zone append writing for direct IO btrfs: zoned: use ZONE_APPEND write for zoned mode btrfs: save irq flags when looking up an ordered extent btrfs: zoned: cache if block group is on a sequential zone ...
Diffstat (limited to 'fs/btrfs/file.c')
-rw-r--r--fs/btrfs/file.c58
1 files changed, 27 insertions, 31 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0e41459b8de6..01a72f53fb5d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -453,12 +453,11 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
}
/*
- * after copy_from_user, pages need to be dirtied and we need to make
- * sure holes are created between the current EOF and the start of
- * any next extents (if required).
- *
- * this also makes the decision about creating an inline extent vs
- * doing real data extents, marking pages dirty and delalloc as required.
+ * After btrfs_copy_from_user(), update the following things for delalloc:
+ * - Mark newly dirtied pages as DELALLOC in the io tree.
+ * Used to advise which range is to be written back.
+ * - Mark modified pages as Uptodate/Dirty and not needing COW fixup
+ * - Update inode size for past EOF write
*/
int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
@@ -1370,6 +1369,12 @@ again:
goto fail;
}
+ err = set_page_extent_mapped(pages[i]);
+ if (err < 0) {
+ faili = i;
+ goto fail;
+ }
+
if (i == 0)
err = prepare_uptodate_page(inode, pages[i], pos,
force_uptodate);
@@ -1454,23 +1459,11 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
}
/*
- * It's possible the pages are dirty right now, but we don't want
- * to clean them yet because copy_from_user may catch a page fault
- * and we might have to fall back to one page at a time. If that
- * happens, we'll unlock these pages and we'd have a window where
- * reclaim could sneak in and drop the once-dirty page on the floor
- * without writing it.
- *
- * We have the pages locked and the extent range locked, so there's
- * no way someone can start IO on any dirty pages in this range.
- *
- * We'll call btrfs_dirty_pages() later on, and that will flip around
- * delalloc bits and dirty the pages as required.
+ * We should be called after prepare_pages() which should have locked
+ * all pages in the range.
*/
- for (i = 0; i < num_pages; i++) {
- set_page_extent_mapped(pages[i]);
+ for (i = 0; i < num_pages; i++)
WARN_ON(!PageLocked(pages[i]));
- }
return ret;
}
@@ -1997,9 +1990,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_inode *inode = BTRFS_I(file_inode(file));
ssize_t num_written = 0;
const bool sync = iocb->ki_flags & IOCB_DSYNC;
@@ -2008,7 +1999,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
* have opened a file as writable, we have to stop this write operation
* to ensure consistency.
*/
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (test_bit(BTRFS_FS_STATE_ERROR, &inode->root->fs_info->fs_state))
return -EROFS;
if (!(iocb->ki_flags & IOCB_DIRECT) &&
@@ -2016,7 +2007,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
return -EOPNOTSUPP;
if (sync)
- atomic_inc(&BTRFS_I(inode)->sync_writers);
+ atomic_inc(&inode->sync_writers);
if (iocb->ki_flags & IOCB_DIRECT)
num_written = btrfs_direct_write(iocb, from);
@@ -2028,14 +2019,14 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
* otherwise subsequent syncs to a file that's been synced in this
* transaction will appear to have already occurred.
*/
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->last_sub_trans = root->log_transid;
- spin_unlock(&BTRFS_I(inode)->lock);
+ spin_lock(&inode->lock);
+ inode->last_sub_trans = inode->root->log_transid;
+ spin_unlock(&inode->lock);
if (num_written > 0)
num_written = generic_write_sync(iocb, num_written);
if (sync)
- atomic_dec(&BTRFS_I(inode)->sync_writers);
+ atomic_dec(&inode->sync_writers);
current->backing_dev_info = NULL;
return num_written;
@@ -2177,8 +2168,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* commit waits for their completion, to avoid data loss if we fsync,
* the current transaction commits before the ordered extents complete
* and a power failure happens right after that.
+ *
+ * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
+ * logical address recorded in the ordered extent may change. We need
+ * to wait for the IO to stabilize the logical address.
*/
- if (full_sync) {
+ if (full_sync || btrfs_is_zoned(fs_info)) {
ret = btrfs_wait_ordered_range(inode, start, len);
} else {
/*
@@ -2241,6 +2236,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
ret = PTR_ERR(trans);
goto out_release_extents;
}
+ trans->in_fsync = true;
ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
btrfs_release_log_ctx_extents(&ctx);