diff options
author | Filipe Manana <fdmanana@suse.com> | 2023-01-11 14:36:20 +0300 |
---|---|---|
committer | David Sterba <dsterba@suse.com> | 2023-02-15 21:38:50 +0300 |
commit | 3e49363be6330f49e670240e8f46e6fe0bf5628a (patch) | |
tree | d7131e2f6e876bf0a65a36641f6b1ced6c658789 | |
parent | ace79df8a44ae1f668615c7177fa1a7bdc04af7e (diff) | |
download | linux-3e49363be6330f49e670240e8f46e6fe0bf5628a.tar.xz |
btrfs: send: cache utimes operations for directories if possible
Whenever we add or remove an entry to a directory, we issue an utimes
command for the directory. If we add 1000 entries to a directory (create
1000 files under it or move 1000 files to it), then we issue the same
utimes command 1000 times, which increases the send stream size, results
in more pipe IO, one search in the send b+tree, allocating one path for
the search, etc, as well as making the receiver do a system call for each
duplicated utimes command.
We also issue an utimes command when we create a new directory, but later
we might add entries to it corresponding to inodes with an higher inode
number, so it's pointless to issue the utimes command before we create
the last inode under the directory.
So use a lru cache to track directories for which we must send a utimes
command. When we need to remove an entry from the cache, we issue the
utimes command for the respective directory. When finishing the send
operation, we go over each cache element and issue the respective utimes
command. Finally the caching is entirely optional, just a performance
optimization, meaning that if we fail to cache (due to memory allocation
failure), we issue the utimes command right away, that is, we fallback
to the previous, unoptimized, behaviour.
This patch belongs to a patchset comprised of the following patches:
btrfs: send: directly return from did_overwrite_ref() and simplify it
btrfs: send: avoid unnecessary generation search at did_overwrite_ref()
btrfs: send: directly return from will_overwrite_ref() and simplify it
btrfs: send: avoid extra b+tree searches when checking reference overrides
btrfs: send: remove send_progress argument from can_rmdir()
btrfs: send: avoid duplicated orphan dir allocation and initialization
btrfs: send: avoid unnecessary orphan dir rbtree search at can_rmdir()
btrfs: send: reduce searches on parent root when checking if dir can be removed
btrfs: send: iterate waiting dir move rbtree only once when processing refs
btrfs: send: initialize all the red black trees earlier
btrfs: send: genericize the backref cache to allow it to be reused
btrfs: adapt lru cache to allow for 64 bits keys on 32 bits systems
btrfs: send: cache information about created directories
btrfs: allow a generation number to be associated with lru cache entries
btrfs: add an api to delete a specific entry from the lru cache
btrfs: send: use the lru cache to implement the name cache
btrfs: send: update size of roots array for backref cache entries
btrfs: send: cache utimes operations for directories if possible
The following test was run before and after applying the whole patchset,
and on a non-debug kernel (Debian's default kernel config):
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
mount $DEV $MNT
mkdir $MNT/A
for ((i = 1; i <= 20000; i++)); do
echo -n > $MNT/A/file_$i
done
btrfs subvolume snapshot -r $MNT $MNT/snap1
mkdir $MNT/B
for ((i = 20000; i <= 40000; i++)); do
echo -n > $MNT/B/file_$i
done
mv $MNT/A/file_* $MNT/B/
btrfs subvolume snapshot -r $MNT $MNT/snap2
start=$(date +%s%N)
btrfs send -p $MNT/snap1 $MNT/snap2 > /dev/null
end=$(date +%s%N)
dur=$(( (end - start) / 1000000 ))
echo "Incremental send took $dur milliseconds"
umount $MNT
Before the whole patchset: 18408 milliseconds
After the whole patchset: 1942 milliseconds (9.5x speedup)
Using 60000 files instead of 40000:
Before the whole patchset: 39764 milliseconds
After the whole patchset: 3076 milliseconds (12.9x speedup)
Using 20000 files instead of 40000:
Before the whole patchset: 5072 milliseconds
After the whole patchset: 916 milliseconds (5.5x speedup)
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
-rw-r--r-- | fs/btrfs/lru_cache.c | 4 | ||||
-rw-r--r-- | fs/btrfs/lru_cache.h | 15 | ||||
-rw-r--r-- | fs/btrfs/send.c | 104 |
3 files changed, 117 insertions, 6 deletions
diff --git a/fs/btrfs/lru_cache.c b/fs/btrfs/lru_cache.c index 38722dc07676..0fe0ae54ac67 100644 --- a/fs/btrfs/lru_cache.c +++ b/fs/btrfs/lru_cache.c @@ -9,6 +9,8 @@ * * @cache: The cache. * @max_size: Maximum size (number of entries) for the cache. + * Use 0 for unlimited size, it's the user's responsability to + * trim the cache in that case. */ void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size) { @@ -129,7 +131,7 @@ int btrfs_lru_cache_store(struct btrfs_lru_cache *cache, return ret; } - if (cache->size == cache->max_size) { + if (cache->max_size > 0 && cache->size == cache->max_size) { struct btrfs_lru_cache_entry *lru_entry; lru_entry = list_first_entry(&cache->lru_list, diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h index a97206f04990..de3e18bce24a 100644 --- a/fs/btrfs/lru_cache.h +++ b/fs/btrfs/lru_cache.h @@ -47,11 +47,26 @@ struct btrfs_lru_cache { unsigned int max_size; }; +#define btrfs_lru_cache_for_each_entry_safe(cache, entry, tmp) \ + list_for_each_entry_safe_reverse((entry), (tmp), &(cache)->lru_list, lru_list) + static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *cache) { return cache->size; } +static inline bool btrfs_lru_cache_is_full(const struct btrfs_lru_cache *cache) +{ + return cache->size >= cache->max_size; +} + +static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry( + struct btrfs_lru_cache *cache) +{ + return list_first_entry_or_null(&cache->lru_list, + struct btrfs_lru_cache_entry, lru_list); +} + void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size); struct btrfs_lru_cache_entry *btrfs_lru_cache_lookup(struct btrfs_lru_cache *cache, u64 key, u64 gen); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 83c737ab19be..e5c963bb873d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -125,6 +125,14 @@ static_assert(offsetof(struct backref_cache_entry, entry) == 0); */ #define SEND_MAX_DIR_CREATED_CACHE_SIZE 64 +/* + * Max number of entries in the cache that stores directories that were already + * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses + * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but + * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64). + */ +#define SEND_MAX_DIR_UTIMES_CACHE_SIZE 64 + struct send_ctx { struct file *send_filp; loff_t send_off; @@ -296,6 +304,7 @@ struct send_ctx { u64 backref_cache_last_reloc_trans; struct btrfs_lru_cache dir_created_cache; + struct btrfs_lru_cache dir_utimes_cache; }; struct pending_dir_move { @@ -2750,6 +2759,63 @@ out: } /* + * If the cache is full, we can't remove entries from it and do a call to + * send_utimes() for each respective inode, because we might be finishing + * processing an inode that is a directory and it just got renamed, and existing + * entries in the cache may refer to inodes that have the directory in their + * full path - in which case we would generate outdated paths (pre-rename) + * for the inodes that the cache entries point to. Instead of prunning the + * cache when inserting, do it after we finish processing each inode at + * finish_inode_if_needed(). + */ +static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen) +{ + struct btrfs_lru_cache_entry *entry; + int ret; + + entry = btrfs_lru_cache_lookup(&sctx->dir_utimes_cache, dir, gen); + if (entry != NULL) + return 0; + + /* Caching is optional, don't fail if we can't allocate memory. */ + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return send_utimes(sctx, dir, gen); + + entry->key = dir; + entry->gen = gen; + + ret = btrfs_lru_cache_store(&sctx->dir_utimes_cache, entry, GFP_KERNEL); + ASSERT(ret != -EEXIST); + if (ret) { + kfree(entry); + return send_utimes(sctx, dir, gen); + } + + return 0; +} + +static int trim_dir_utimes_cache(struct send_ctx *sctx) +{ + while (btrfs_lru_cache_size(&sctx->dir_utimes_cache) > + SEND_MAX_DIR_UTIMES_CACHE_SIZE) { + struct btrfs_lru_cache_entry *lru; + int ret; + + lru = btrfs_lru_cache_lru_entry(&sctx->dir_utimes_cache); + ASSERT(lru != NULL); + + ret = send_utimes(sctx, lru->key, lru->gen); + if (ret) + return ret; + + btrfs_lru_cache_remove(&sctx->dir_utimes_cache, lru); + } + + return 0; +} + +/* * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have * a valid path yet because we did not process the refs yet. So, the inode * is created as orphan. @@ -3542,7 +3608,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) } finish: - ret = send_utimes(sctx, pm->ino, pm->gen); + ret = cache_dir_utimes(sctx, pm->ino, pm->gen); if (ret < 0) goto out; @@ -3562,7 +3628,7 @@ finish: if (ret < 0) goto out; - ret = send_utimes(sctx, cur->dir, cur->dir_gen); + ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; } @@ -4509,8 +4575,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (ret == inode_state_did_create || ret == inode_state_no_change) { - /* TODO delayed utimes */ - ret = send_utimes(sctx, cur->dir, cur->dir_gen); + ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; } else if (ret == inode_state_did_delete && @@ -6692,12 +6757,26 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) * it's moved/renamed, therefore we don't need to do it here. */ sctx->send_progress = sctx->cur_ino + 1; - ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + + /* + * If the current inode is a non-empty directory, delay issuing + * the utimes command for it, as it's very likely we have inodes + * with an higher number inside it. We want to issue the utimes + * command only after adding all dentries to it. + */ + if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_size > 0) + ret = cache_dir_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + else + ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + if (ret < 0) goto out; } out: + if (!ret) + ret = trim_dir_utimes_cache(sctx); + return ret; } @@ -7982,6 +8061,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) int clone_sources_to_rollback = 0; size_t alloc_size; int sort_clone_roots = 0; + struct btrfs_lru_cache_entry *entry; + struct btrfs_lru_cache_entry *tmp; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -8037,6 +8118,11 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) btrfs_lru_cache_init(&sctx->backref_cache, SEND_MAX_BACKREF_CACHE_SIZE); btrfs_lru_cache_init(&sctx->dir_created_cache, SEND_MAX_DIR_CREATED_CACHE_SIZE); + /* + * This cache is periodically trimmed to a fixed size elsewhere, see + * cache_dir_utimes() and trim_dir_utimes_cache(). + */ + btrfs_lru_cache_init(&sctx->dir_utimes_cache, 0); sctx->pending_dir_moves = RB_ROOT; sctx->waiting_dir_moves = RB_ROOT; @@ -8217,6 +8303,13 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) if (ret < 0) goto out; + btrfs_lru_cache_for_each_entry_safe(&sctx->dir_utimes_cache, entry, tmp) { + ret = send_utimes(sctx, entry->key, entry->gen); + if (ret < 0) + goto out; + btrfs_lru_cache_remove(&sctx->dir_utimes_cache, entry); + } + if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) { ret = begin_cmd(sctx, BTRFS_SEND_C_END); if (ret < 0) @@ -8301,6 +8394,7 @@ out: btrfs_lru_cache_clear(&sctx->name_cache); btrfs_lru_cache_clear(&sctx->backref_cache); btrfs_lru_cache_clear(&sctx->dir_created_cache); + btrfs_lru_cache_clear(&sctx->dir_utimes_cache); kfree(sctx); } |