From 4f736d4f97bb18f86efd844d937b035ca2db7841 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 2 Jul 2020 12:31:59 +0100 Subject: btrfs: only commit the delayed inode when doing a full fsync commit 8c8648dd1f6d62aeb912deeb788b6ac33cb782e7 upstream. Commit 2c2c452b0cafdc ("Btrfs: fix fsync when extend references are added to an inode") forced a commit of the delayed inode when logging an inode in order to ensure we would end up logging the inode item during a full fsync. By committing the delayed inode, we updated the inode item in the fs/subvolume tree and then later when copying items from leafs modified in the current transaction into the log tree (with copy_inode_items_to_log()) we ended up copying the inode item from the fs/subvolume tree into the log tree. Logging an up to date version of the inode item is required to make sure at log replay time we get the link count fixup triggered among other things (replay xattr deletes, etc). The test case generic/040 from fstests exercises the bug which that commit fixed. However for a fast fsync we don't need to commit the delayed inode because we always log an up to date version of the inode item based on the struct btrfs_inode we have in-memory. We started doing this for fast fsyncs since commit e4545de5b035c7 ("Btrfs: fix fsync data loss after append write"). So just stop committing the delayed inode if we are doing a fast fsync, we are only wasting time and adding contention on fs/subvolume tree. This patch is part of a series that has the following patches: 1/4 btrfs: only commit the delayed inode when doing a full fsync 2/4 btrfs: only commit delayed items at fsync if we are logging a directory 3/4 btrfs: stop incremening log_batch for the log root tree when syncing log 4/4 btrfs: remove no longer needed use of log_writers for the log root tree After the entire patchset applied I saw about 12% decrease on max latency reported by dbench. The test was done on a qemu vm, with 8 cores, 16Gb of ram, using kvm and using a raw NVMe device directly (no intermediary fs on the host). The test was invoked like the following: mkfs.btrfs -f /dev/sdk mount -o ssd -o nospace_cache /dev/sdk /mnt/sdk dbench -D /mnt/sdk -t 300 8 umount /mnt/dsk CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-log.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index cd5348f352dd..8adafc5debeb 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5130,7 +5130,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key max_key; struct btrfs_root *log = root->log_root; int err = 0; - int ret; + int ret = 0; bool fast_search = false; u64 ino = btrfs_ino(inode); struct extent_map_tree *em_tree = &inode->extent_tree; @@ -5167,14 +5167,16 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, /* * Only run delayed items if we are a dir or a new file. - * Otherwise commit the delayed inode only, which is needed in - * order for the log replay code to mark inodes for link count - * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). + * Otherwise commit the delayed inode only if the full sync flag is set, + * as we want to make sure an up to date version is in the subvolume + * tree so copy_inode_items_to_log() / copy_items() can find it and copy + * it to the log tree. For a non full sync, we always log the inode item + * based on the in-memory struct btrfs_inode which is always up to date. */ if (S_ISDIR(inode->vfs_inode.i_mode) || inode->generation > fs_info->last_trans_committed) ret = btrfs_commit_inode_delayed_items(trans, inode); - else + else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) ret = btrfs_commit_inode_delayed_inode(inode); if (ret) { -- cgit v1.2.3 From 99da62f72cdfd44bf51a7ae49ae056e33af07852 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 2 Jul 2020 12:32:31 +0100 Subject: btrfs: stop incremening log_batch for the log root tree when syncing log commit 28a9579561bcb9082715e720eac93012e708ab94 upstream. We are incrementing the log_batch atomic counter of the root log tree but we never use that counter, it's used only for the log trees of subvolume roots. We started doing it when we moved the log_batch and log_write counters from the global, per fs, btrfs_fs_info structure, into the btrfs_root structure in commit 7237f1833601dc ("Btrfs: fix tree logs parallel sync"). So just stop doing it for the log root tree and add a comment over the field declaration so inform it's used only for log trees of subvolume roots. This patch is part of a series that has the following patches: 1/4 btrfs: only commit the delayed inode when doing a full fsync 2/4 btrfs: only commit delayed items at fsync if we are logging a directory 3/4 btrfs: stop incremening log_batch for the log root tree when syncing log 4/4 btrfs: remove no longer needed use of log_writers for the log root tree After the entire patchset applied I saw about 12% decrease on max latency reported by dbench. The test was done on a qemu vm, with 8 cores, 16Gb of ram, using kvm and using a raw NVMe device directly (no intermediary fs on the host). The test was invoked like the following: mkfs.btrfs -f /dev/sdk mount -o ssd -o nospace_cache /dev/sdk /mnt/sdk dbench -D /mnt/sdk -t 300 8 umount /mnt/dsk CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/ctree.h | 1 + fs/btrfs/tree-log.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7c8efa0c3ee6..a217f8557b6d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1061,6 +1061,7 @@ struct btrfs_root { struct list_head log_ctxs[2]; atomic_t log_writers; atomic_t log_commit[2]; + /* Used only for log trees of subvolumes, not for the log root tree */ atomic_t log_batch; int log_transid; /* No matter the commit succeeds or not*/ diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8adafc5debeb..f75b48b93c0d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3116,7 +3116,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_init_log_ctx(&root_log_ctx, NULL); mutex_lock(&log_root_tree->log_mutex); - atomic_inc(&log_root_tree->log_batch); atomic_inc(&log_root_tree->log_writers); index2 = log_root_tree->log_transid % 2; -- cgit v1.2.3 From 39b0de06b4e52556c29e670ae226f5d5705baf55 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 2 Jul 2020 12:32:20 +0100 Subject: btrfs: only commit delayed items at fsync if we are logging a directory commit 5aa7d1a7f4a2f8ca6be1f32415e9365d026e8fa7 upstream. When logging an inode we are committing its delayed items if either the inode is a directory or if it is a new inode, created in the current transaction. We need to do it for directories, since new directory indexes are stored as delayed items of the inode and when logging a directory we need to be able to access all indexes from the fs/subvolume tree in order to figure out which index ranges need to be logged. However for new inodes that are not directories, we do not need to do it because the only type of delayed item they can have is the inode item, and we are guaranteed to always log an up to date version of the inode item: *) for a full fsync we do it by committing the delayed inode and then copying the item from the fs/subvolume tree with copy_inode_items_to_log(); *) for a fast fsync we always log the inode item based on the contents of the in-memory struct btrfs_inode. We guarantee this is always done since commit e4545de5b035c7 ("Btrfs: fix fsync data loss after append write"). So stop running delayed items for a new inodes that are not directories, since that forces committing the delayed inode into the fs/subvolume tree, wasting time and adding contention to the tree when a full fsync is not required. We will only do it in case a fast fsync is needed. This patch is part of a series that has the following patches: 1/4 btrfs: only commit the delayed inode when doing a full fsync 2/4 btrfs: only commit delayed items at fsync if we are logging a directory 3/4 btrfs: stop incremening log_batch for the log root tree when syncing log 4/4 btrfs: remove no longer needed use of log_writers for the log root tree After the entire patchset applied I saw about 12% decrease on max latency reported by dbench. The test was done on a qemu vm, with 8 cores, 16Gb of ram, using kvm and using a raw NVMe device directly (no intermediary fs on the host). The test was invoked like the following: mkfs.btrfs -f /dev/sdk mount -o ssd -o nospace_cache /dev/sdk /mnt/sdk dbench -D /mnt/sdk -t 300 8 umount /mnt/dsk CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-log.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f75b48b93c0d..9f54e52d36c1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5122,7 +5122,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, const loff_t end, struct btrfs_log_ctx *ctx) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct btrfs_path *dst_path; struct btrfs_key min_key; @@ -5165,15 +5164,17 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, max_key.offset = (u64)-1; /* - * Only run delayed items if we are a dir or a new file. + * Only run delayed items if we are a directory. We want to make sure + * all directory indexes hit the fs/subvolume tree so we can find them + * and figure out which index ranges have to be logged. + * * Otherwise commit the delayed inode only if the full sync flag is set, * as we want to make sure an up to date version is in the subvolume * tree so copy_inode_items_to_log() / copy_items() can find it and copy * it to the log tree. For a non full sync, we always log the inode item * based on the in-memory struct btrfs_inode which is always up to date. */ - if (S_ISDIR(inode->vfs_inode.i_mode) || - inode->generation > fs_info->last_trans_committed) + if (S_ISDIR(inode->vfs_inode.i_mode)) ret = btrfs_commit_inode_delayed_items(trans, inode); else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) ret = btrfs_commit_inode_delayed_inode(inode); -- cgit v1.2.3 From bbdfe026a7676e459f84a2aee0602055eaad884b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 2 Jul 2020 12:32:40 +0100 Subject: btrfs: remove no longer needed use of log_writers for the log root tree commit a93e01682e283f6de09d6ce8f805dc52a2e942fb upstream. When syncing the log, we used to update the log root tree without holding neither the log_mutex of the subvolume root nor the log_mutex of log root tree. We used to have two critical sections delimited by the log_mutex of the log root tree, so in the first one we incremented the log_writers of the log root tree and on the second one we decremented it and waited for the log_writers counter to go down to zero. This was because the update of the log root tree happened between the two critical sections. The use of two critical sections allowed a little bit more of parallelism and required the use of the log_writers counter, necessary to make sure we didn't miss any log root tree update when we have multiple tasks trying to sync the log in parallel. However after commit 06989c799f0481 ("Btrfs: fix race updating log root item during fsync") the log root tree update was moved into a critical section delimited by the subvolume's log_mutex. Later another commit moved the log tree update from that critical section into the second critical section delimited by the log_mutex of the log root tree. Both commits addressed different bugs. The end result is that the first critical section delimited by the log_mutex of the log root tree became pointless, since there's nothing done between it and the second critical section, we just have an unlock of the log_mutex followed by a lock operation. This means we can merge both critical sections, as the first one does almost nothing now, and we can stop using the log_writers counter of the log root tree, which was incremented in the first critical section and decremented in the second criticial section, used to make sure no one in the second critical section started writeback of the log root tree before some other task updated it. So just remove the mutex_unlock() followed by mutex_lock() of the log root tree, as well as the use of the log_writers counter for the log root tree. This patch is part of a series that has the following patches: 1/4 btrfs: only commit the delayed inode when doing a full fsync 2/4 btrfs: only commit delayed items at fsync if we are logging a directory 3/4 btrfs: stop incremening log_batch for the log root tree when syncing log 4/4 btrfs: remove no longer needed use of log_writers for the log root tree After the entire patchset applied I saw about 12% decrease on max latency reported by dbench. The test was done on a qemu vm, with 8 cores, 16Gb of ram, using kvm and using a raw NVMe device directly (no intermediary fs on the host). The test was invoked like the following: mkfs.btrfs -f /dev/sdk mount -o ssd -o nospace_cache /dev/sdk /mnt/sdk dbench -D /mnt/sdk -t 300 8 umount /mnt/dsk CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/ctree.h | 1 + fs/btrfs/tree-log.c | 13 ------------- 2 files changed, 1 insertion(+), 13 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a217f8557b6d..be5f0501280d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1059,6 +1059,7 @@ struct btrfs_root { wait_queue_head_t log_writer_wait; wait_queue_head_t log_commit_wait[2]; struct list_head log_ctxs[2]; + /* Used only for log trees of subvolumes, not for the log root tree */ atomic_t log_writers; atomic_t log_commit[2]; /* Used only for log trees of subvolumes, not for the log root tree */ diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9f54e52d36c1..bcd384825c78 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3116,28 +3116,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_init_log_ctx(&root_log_ctx, NULL); mutex_lock(&log_root_tree->log_mutex); - atomic_inc(&log_root_tree->log_writers); index2 = log_root_tree->log_transid % 2; list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); root_log_ctx.log_transid = log_root_tree->log_transid; - mutex_unlock(&log_root_tree->log_mutex); - - mutex_lock(&log_root_tree->log_mutex); - /* * Now we are safe to update the log_root_tree because we're under the * log_mutex, and we're a current writer so we're holding the commit * open until we drop the log_mutex. */ ret = update_log_root(trans, log, &new_root_item); - - if (atomic_dec_and_test(&log_root_tree->log_writers)) { - /* atomic_dec_and_test implies a barrier */ - cond_wake_up_nomb(&log_root_tree->log_writer_wait); - } - if (ret) { if (!list_empty(&root_log_ctx.list)) list_del_init(&root_log_ctx.list); @@ -3183,8 +3172,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, root_log_ctx.log_transid - 1); } - wait_for_writer(log_root_tree); - /* * now that we've moved on to the tree of log tree roots, * check the full commit flag again -- cgit v1.2.3 From 994f3284b2747f18fe0bca2230ed82c31c8dcb96 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 29 Jul 2020 10:17:50 +0100 Subject: btrfs: fix memory leaks after failure to lookup checksums during inode logging commit 4f26433e9b3eb7a55ed70d8f882ae9cd48ba448b upstream. While logging an inode, at copy_items(), if we fail to lookup the checksums for an extent we release the destination path, free the ins_data array and then return immediately. However a previous iteration of the for loop may have added checksums to the ordered_sums list, in which case we leak the memory used by them. So fix this by making sure we iterate the ordered_sums list and free all its checksums before returning. Fixes: 3650860b90cc2a ("Btrfs: remove almost all of the BUG()'s from tree-log.c") CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-log.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index bcd384825c78..d22ff1e0963c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4027,11 +4027,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, fs_info->csum_root, ds + cs, ds + cs + cl - 1, &ordered_sums, 0); - if (ret) { - btrfs_release_path(dst_path); - kfree(ins_data); - return ret; - } + if (ret) + break; } } } @@ -4044,7 +4041,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, * we have to do this after the loop above to avoid changing the * log tree while trying to change the log tree. */ - ret = 0; while (!list_empty(&ordered_sums)) { struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, struct btrfs_ordered_sum, -- cgit v1.2.3