summaryrefslogtreecommitdiff
path: root/fs/ext4/orphan.c
diff options
context:
space:
mode:
authorJan Kara <jack@suse.cz>2021-08-16 12:57:08 +0300
committerTheodore Ts'o <tytso@mit.edu>2021-08-31 06:36:51 +0300
commit4a79a98c7b19dd3d4cfe9200fbc384ba9119e039 (patch)
tree85a3fa57a42cbd80394058461312c9c007525f96 /fs/ext4/orphan.c
parent3a6541e97c035dba90cdf37169d73b2d8057e55d (diff)
downloadlinux-4a79a98c7b19dd3d4cfe9200fbc384ba9119e039.tar.xz
ext4: Improve scalability of ext4 orphan file handling
Even though the length of the critical section when adding / removing orphaned inodes was significantly reduced by using orphan file, the contention of lock protecting orphan file still appears high in profiles for truncate / unlink intensive workloads with high number of threads. This patch makes handling of orphan file completely lockless. Also to reduce conflicts between CPUs different CPUs start searching for empty slot in orphan file in different blocks. Performance comparison of locked orphan file handling, lockless orphan file handling, and completely disabled orphan inode handling from 80 CPU Xeon Server with 526 GB of RAM, filesystem located on SAS SSD disk, average of 5 runs: stress-orphan (microbenchmark truncating files byte-by-byte from N processes in parallel) Threads Time Time Time Orphan locked Orphan lockless No orphan 1 0.945600 0.939400 0.891200 2 1.331800 1.246600 1.174400 4 1.995000 1.780600 1.713200 8 6.424200 4.900000 4.106000 16 14.937600 8.516400 8.138000 32 33.038200 24.565600 24.002200 64 60.823600 39.844600 38.440200 128 122.941400 70.950400 69.315000 So we can see that with lockless orphan file handling, addition / deletion of orphaned inodes got almost completely out of picture even for a microbenchmark stressing it. For reaim creat_clo workload on ramdisk there are also noticeable gains (average of 5 runs): Clients Vanilla (ops/s) Patched (ops/s) creat_clo-1 14705.88 ( 0.00%) 14354.07 * -2.39%* creat_clo-3 27108.43 ( 0.00%) 28301.89 ( 4.40%) creat_clo-5 37406.48 ( 0.00%) 45180.73 * 20.78%* creat_clo-7 41338.58 ( 0.00%) 54687.50 * 32.29%* creat_clo-9 45226.13 ( 0.00%) 62937.07 * 39.16%* creat_clo-11 44000.00 ( 0.00%) 65088.76 * 47.93%* creat_clo-13 36516.85 ( 0.00%) 68661.97 * 88.03%* creat_clo-15 30864.20 ( 0.00%) 69551.78 * 125.35%* creat_clo-17 27478.45 ( 0.00%) 67729.08 * 146.48%* creat_clo-19 25000.00 ( 0.00%) 61621.62 * 146.49%* creat_clo-21 18772.35 ( 0.00%) 63829.79 * 240.02%* creat_clo-23 16698.94 ( 0.00%) 61938.96 * 270.92%* creat_clo-25 14973.05 ( 0.00%) 56947.61 * 280.33%* creat_clo-27 16436.69 ( 0.00%) 65008.03 * 295.51%* creat_clo-29 13949.01 ( 0.00%) 69047.62 * 395.00%* creat_clo-31 14283.52 ( 0.00%) 67982.45 * 375.95%* Reviewed-by: Theodore Ts'o <tytso@mit.edu> Reviewed-by: Lukas Czerner <lczerner@redhat.com> Signed-off-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20210816095713.16537-5-jack@suse.cz Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Diffstat (limited to 'fs/ext4/orphan.c')
-rw-r--r--fs/ext4/orphan.c77
1 files changed, 52 insertions, 25 deletions
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
index ac928f60234a..53adc8f570a3 100644
--- a/fs/ext4/orphan.c
+++ b/fs/ext4/orphan.c
@@ -10,16 +10,31 @@
static int ext4_orphan_file_add(handle_t *handle, struct inode *inode)
{
- int i, j;
+ int i, j, start;
struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info;
int ret = 0;
+ bool found = false;
__le32 *bdata;
int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb);
+ int looped = 0;
+
+ /*
+ * Find block with free orphan entry. Use CPU number for a naive hash
+ * for a search start in the orphan file
+ */
+ start = raw_smp_processor_id()*13 % oi->of_blocks;
+ i = start;
+ do {
+ if (atomic_dec_if_positive(&oi->of_binfo[i].ob_free_entries)
+ >= 0) {
+ found = true;
+ break;
+ }
+ if (++i >= oi->of_blocks)
+ i = 0;
+ } while (i != start);
- spin_lock(&oi->of_lock);
- for (i = 0; i < oi->of_blocks && !oi->of_binfo[i].ob_free_entries; i++);
- if (i == oi->of_blocks) {
- spin_unlock(&oi->of_lock);
+ if (!found) {
/*
* For now we don't grow or shrink orphan file. We just use
* whatever was allocated at mke2fs time. The additional
@@ -28,28 +43,43 @@ static int ext4_orphan_file_add(handle_t *handle, struct inode *inode)
*/
return -ENOSPC;
}
- oi->of_binfo[i].ob_free_entries--;
- spin_unlock(&oi->of_lock);
- /*
- * Get access to orphan block. We have dropped of_lock but since we
- * have decremented number of free entries we are guaranteed free entry
- * in our block.
- */
ret = ext4_journal_get_write_access(handle, inode->i_sb,
oi->of_binfo[i].ob_bh, EXT4_JTR_ORPHAN_FILE);
- if (ret)
+ if (ret) {
+ atomic_inc(&oi->of_binfo[i].ob_free_entries);
return ret;
+ }
bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
- spin_lock(&oi->of_lock);
/* Find empty slot in a block */
- for (j = 0; j < inodes_per_ob && bdata[j]; j++);
- BUG_ON(j == inodes_per_ob);
- bdata[j] = cpu_to_le32(inode->i_ino);
+ j = 0;
+ do {
+ if (looped) {
+ /*
+ * Did we walk through the block several times without
+ * finding free entry? It is theoretically possible
+ * if entries get constantly allocated and freed or
+ * if the block is corrupted. Avoid indefinite looping
+ * and bail. We'll use orphan list instead.
+ */
+ if (looped > 3) {
+ atomic_inc(&oi->of_binfo[i].ob_free_entries);
+ return -ENOSPC;
+ }
+ cond_resched();
+ }
+ while (bdata[j]) {
+ if (++j >= inodes_per_ob) {
+ j = 0;
+ looped++;
+ }
+ }
+ } while (cmpxchg(&bdata[j], (__le32)0, cpu_to_le32(inode->i_ino)) !=
+ (__le32)0);
+
EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j;
ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
- spin_unlock(&oi->of_lock);
return ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[i].ob_bh);
}
@@ -180,10 +210,8 @@ static int ext4_orphan_file_del(handle_t *handle, struct inode *inode)
goto out;
bdata = (__le32 *)(oi->of_binfo[blk].ob_bh->b_data);
- spin_lock(&oi->of_lock);
bdata[off] = 0;
- oi->of_binfo[blk].ob_free_entries++;
- spin_unlock(&oi->of_lock);
+ atomic_inc(&oi->of_binfo[blk].ob_free_entries);
ret = ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[blk].ob_bh);
out:
ext4_clear_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
@@ -552,8 +580,6 @@ int ext4_init_orphan_info(struct super_block *sb)
struct ext4_orphan_block_tail *ot;
ino_t orphan_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_orphan_file_inum);
- spin_lock_init(&oi->of_lock);
-
if (!ext4_has_feature_orphan_file(sb))
return 0;
@@ -597,7 +623,7 @@ int ext4_init_orphan_info(struct super_block *sb)
for (j = 0; j < inodes_per_ob; j++)
if (bdata[j] == 0)
free++;
- oi->of_binfo[i].ob_free_entries = free;
+ atomic_set(&oi->of_binfo[i].ob_free_entries, free);
}
iput(inode);
return 0;
@@ -619,7 +645,8 @@ int ext4_orphan_file_empty(struct super_block *sb)
if (!ext4_has_feature_orphan_file(sb))
return 1;
for (i = 0; i < oi->of_blocks; i++)
- if (oi->of_binfo[i].ob_free_entries != inodes_per_ob)
+ if (atomic_read(&oi->of_binfo[i].ob_free_entries) !=
+ inodes_per_ob)
return 0;
return 1;
}