From 834224e81cdc265456f73fed748a349e43e2d8ef Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 8 May 2025 17:59:00 +0000 Subject: ext4: convert i_fc_lock to spinlock Convert ext4_inode_info->i_fc_lock to spinlock to avoid sleeping in invalid contexts. Reviewed-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://patch.msgid.link/20250508175908.1004880-2-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'fs/ext4/fast_commit.c') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index da4263a14a20..63859ec6d91d 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -385,7 +385,7 @@ static int ext4_fc_track_template( int ret; tid = handle->h_transaction->t_tid; - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); if (tid == ei->i_sync_tid) { update = true; } else { @@ -393,8 +393,7 @@ static int ext4_fc_track_template( ei->i_sync_tid = tid; } ret = __fc_track_fn(handle, inode, args, update); - mutex_unlock(&ei->i_fc_lock); - + spin_unlock(&ei->i_fc_lock); if (!enqueue) return ret; @@ -428,19 +427,19 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - mutex_unlock(&ei->i_fc_lock); + spin_unlock(&ei->i_fc_lock); if (IS_ENCRYPTED(dir)) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME, handle); - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); return -EOPNOTSUPP; } node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle); - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -471,7 +470,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist); } spin_unlock(&sbi->s_fc_lock); - mutex_lock(&ei->i_fc_lock); + 
spin_lock(&ei->i_fc_lock); return 0; } @@ -893,15 +892,15 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) struct ext4_extent *ex; int ret; - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); if (ei->i_fc_lblk_len == 0) { - mutex_unlock(&ei->i_fc_lock); + spin_unlock(&ei->i_fc_lock); return 0; } old_blk_size = ei->i_fc_lblk_start; new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; ei->i_fc_lblk_len = 0; - mutex_unlock(&ei->i_fc_lock); + spin_unlock(&ei->i_fc_lock); cur_lblk_off = old_blk_size; ext4_debug("will try writing %d to %d for inode %ld\n", -- cgit v1.2.3 From 4d3266463ed06af2916b306bdb0ebd647726ba49 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 8 May 2025 17:59:01 +0000 Subject: ext4: for committing inode, make ext4_fc_track_inode wait If the inode that's being requested to track using ext4_fc_track_inode is being committed, then wait until the inode finishes the commit. Also, add calls to ext4_fc_track_inode at the right places. With this patch, now calling ext4_reserve_inode_write() results in inode being tracked for next fast commit. This ensures that by the time ext4_reserve_inode_write() returns, it is ready to be modified and won't be committed until the corresponding handle is open. A subtle lock ordering requirement with i_data_sem (which is documented in the code) requires that ext4_fc_track_inode() be called before grabbing i_data_sem. So, this patch also adds explicit ext4_fc_track_inode() calls in places where i_data_sem grabbed. 
Signed-off-by: Harshad Shirwadkar Link: https://patch.msgid.link/20250508175908.1004880-3-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 35 +++++++++++++++++++++++++++++++++++ fs/ext4/inline.c | 1 + fs/ext4/inode.c | 5 +++++ 3 files changed, 41 insertions(+) (limited to 'fs/ext4/fast_commit.c') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 63859ec6d91d..c4d3c71d5e6c 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -12,6 +12,7 @@ #include "ext4_extents.h" #include "mballoc.h" +#include /* * Ext4 Fast Commits * ----------------- @@ -570,6 +571,8 @@ static int __track_inode(handle_t *handle, struct inode *inode, void *arg, void ext4_fc_track_inode(handle_t *handle, struct inode *inode) { + struct ext4_inode_info *ei = EXT4_I(inode); + wait_queue_head_t *wq; int ret; if (S_ISDIR(inode->i_mode)) @@ -587,6 +590,38 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode) if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; + if (!list_empty(&ei->i_fc_list)) + return; + + /* + * If we come here, we may sleep while waiting for the inode to + * commit. We shouldn't be holding i_data_sem when we go to sleep since + * the commit path needs to grab the lock while committing the inode. 
+ */ + lockdep_assert_not_held(&ei->i_data_sem); + + while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { +#if (BITS_PER_LONG < 64) + DEFINE_WAIT_BIT(wait, &ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); +#else + DEFINE_WAIT_BIT(wait, &ei->i_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_flags, + EXT4_STATE_FC_COMMITTING); +#endif + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) + schedule(); + finish_wait(wq, &wait.wq_entry); + } + + /* + * From this point on, this inode will not be committed either + * by fast or full commit as long as the handle is open. + */ ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); trace_ext4_fc_track_inode(handle, inode, ret); } diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 2c9b762925c7..fbc1c84b555c 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -601,6 +601,7 @@ retry: goto out; } + ext4_fc_track_inode(handle, inode); ret = ext4_destroy_inline_data_nolock(handle, inode); if (ret) goto out; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 94c7d2d828a6..d58b99407390 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -696,6 +696,8 @@ found: if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) return retval; + + ext4_fc_track_inode(handle, inode); /* * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take @@ -4072,6 +4074,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) if (end_lblk > start_lblk) { ext4_lblk_t hole_len = end_lblk - start_lblk; + ext4_fc_track_inode(handle, inode); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); @@ -4224,6 +4227,7 @@ int ext4_truncate(struct inode *inode) if (err) goto out_stop; + ext4_fc_track_inode(handle, inode); down_write(&EXT4_I(inode)->i_data_sem); 
ext4_discard_preallocations(inode); @@ -5895,6 +5899,7 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, brelse(iloc->bh); iloc->bh = NULL; } + ext4_fc_track_inode(handle, inode); } ext4_std_error(inode->i_sb, err); return err; -- cgit v1.2.3 From 857d32f2618166765ce9306a246d0745afc76859 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 8 May 2025 17:59:03 +0000 Subject: ext4: rework fast commit commit path This patch reworks fast commit's commit path to remove locking the journal for the entire duration of a fast commit. Instead, we only lock the journal while marking all the eligible inodes as "committing". This allows handles to make progress in parallel with the fast commit. Signed-off-by: Harshad Shirwadkar Link: https://patch.msgid.link/20250508175908.1004880-5-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/fast_commit.c | 199 +++++++++++++++++++++++++++++++------------------- fs/jbd2/journal.c | 2 - 3 files changed, 126 insertions(+), 76 deletions(-) (limited to 'fs/ext4/fast_commit.c') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 79dfb57a7046..493d9ac7a577 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1916,6 +1916,7 @@ enum { EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */ + EXT4_STATE_FC_FLUSHING_DATA, /* Fast commit flushing data */ EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */ }; diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index c4d3c71d5e6c..a2cb4d965dc1 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -287,24 +287,55 @@ void ext4_fc_del(struct inode *inode) struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_fc_dentry_update *fc_dentry; + wait_queue_head_t *wq; if (ext4_fc_disabled(inode->i_sb)) return; -restart: spin_lock(&sbi->s_fc_lock); if 
(list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { spin_unlock(&sbi->s_fc_lock); return; } - if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { - ext4_fc_wait_committing_inode(inode); - goto restart; + /* + * Since ext4_fc_del is called from ext4_evict_inode while having a + * handle open, there is no need for us to wait here even if a fast + * commit is going on. That is because, if this inode is being + * committed, ext4_mark_inode_dirty would have waited for inode commit + * operation to finish before we come here. So, by the time we come + * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So, + * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode + * here. + * + * We may come here without any handles open in the "no_delete" case of + * ext4_evict_inode as well. However, if that happens, we first mark the + * file system as fast commit ineligible anyway. So, even in that case, + * it is okay to remove the inode from the fc list. + */ + WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING) + && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)); + while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { +#if (BITS_PER_LONG < 64) + DEFINE_WAIT_BIT(wait, &ei->i_state_flags, + EXT4_STATE_FC_FLUSHING_DATA); + wq = bit_waitqueue(&ei->i_state_flags, + EXT4_STATE_FC_FLUSHING_DATA); +#else + DEFINE_WAIT_BIT(wait, &ei->i_flags, + EXT4_STATE_FC_FLUSHING_DATA); + wq = bit_waitqueue(&ei->i_flags, + EXT4_STATE_FC_FLUSHING_DATA); +#endif + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { + spin_unlock(&sbi->s_fc_lock); + schedule(); + spin_lock(&sbi->s_fc_lock); + } + finish_wait(wq, &wait.wq_entry); } - - if (!list_empty(&ei->i_fc_list)) - list_del_init(&ei->i_fc_list); + list_del_init(&ei->i_fc_list); /* * Since this inode is getting removed, let's also remove all FC @@ -325,8 +356,6 @@ restart: 
release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); - - return; } /* @@ -590,9 +619,6 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode) if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; - if (!list_empty(&ei->i_fc_list)) - return; - /* * If we come here, we may sleep while waiting for the inode to * commit. We shouldn't be holding i_data_sem when we go to sleep since @@ -988,61 +1014,25 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) } -/* Submit data for all the fast commit inodes */ -static int ext4_fc_submit_inode_data_all(journal_t *journal) +/* Flushes data of all the inodes in the commit queue. */ +static int ext4_fc_flush_data(journal_t *journal) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *ei; int ret = 0; - spin_lock(&sbi->s_fc_lock); list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { - ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); - while (atomic_read(&ei->i_fc_updates)) { - DEFINE_WAIT(wait); - - prepare_to_wait(&ei->i_fc_wait, &wait, - TASK_UNINTERRUPTIBLE); - if (atomic_read(&ei->i_fc_updates)) { - spin_unlock(&sbi->s_fc_lock); - schedule(); - spin_lock(&sbi->s_fc_lock); - } - finish_wait(&ei->i_fc_wait, &wait); - } - spin_unlock(&sbi->s_fc_lock); ret = jbd2_submit_inode_data(journal, ei->jinode); if (ret) return ret; - spin_lock(&sbi->s_fc_lock); } - spin_unlock(&sbi->s_fc_lock); - - return ret; -} - -/* Wait for completion of data for all the fast commit inodes */ -static int ext4_fc_wait_inode_data_all(journal_t *journal) -{ - struct super_block *sb = journal->j_private; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_inode_info *pos, *n; - int ret = 0; - spin_lock(&sbi->s_fc_lock); - list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { - if (!ext4_test_inode_state(&pos->vfs_inode, - EXT4_STATE_FC_COMMITTING)) - continue; 
- spin_unlock(&sbi->s_fc_lock); - - ret = jbd2_wait_inode_data(journal, pos->jinode); + list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ret = jbd2_wait_inode_data(journal, ei->jinode); if (ret) return ret; - spin_lock(&sbi->s_fc_lock); } - spin_unlock(&sbi->s_fc_lock); return 0; } @@ -1123,26 +1113,81 @@ static int ext4_fc_perform_commit(journal_t *journal) int ret = 0; u32 crc = 0; - ret = ext4_fc_submit_inode_data_all(journal); - if (ret) - return ret; + /* + * Step 1: Mark all inodes on s_fc_q[MAIN] with + * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being + * freed until the data flush is over. + */ + spin_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_set_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_FLUSHING_DATA); + } + spin_unlock(&sbi->s_fc_lock); + + /* Step 2: Flush data for all the eligible inodes. */ + ret = ext4_fc_flush_data(journal); - ret = ext4_fc_wait_inode_data_all(journal); + /* + * Step 3: Clear EXT4_STATE_FC_FLUSHING_DATA flag, before returning + * any error from step 2. This ensures that waiters waiting on + * EXT4_STATE_FC_FLUSHING_DATA can resume. + */ + spin_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_clear_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_FLUSHING_DATA); +#if (BITS_PER_LONG < 64) + wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA); +#else + wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA); +#endif + } + + /* + * Make sure clearing of EXT4_STATE_FC_FLUSHING_DATA is visible before + * the waiter checks the bit. Pairs with implicit barrier in + * prepare_to_wait() in ext4_fc_del(). + */ + smp_mb(); + spin_unlock(&sbi->s_fc_lock); + + /* + * If we encountered error in Step 2, return it now after clearing + * EXT4_STATE_FC_FLUSHING_DATA bit. + */ if (ret) return ret; + + /* Step 4: Mark all inodes as being committed. 
*/ + jbd2_journal_lock_updates(journal); /* - * If file system device is different from journal device, issue a cache - * flush before we start writing fast commit blocks. + * The journal is now locked. No more handles can start and all the + * previous handles are now drained. We now mark the inodes on the + * commit queue as being committed. + */ + spin_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_set_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_COMMITTING); + } + spin_unlock(&sbi->s_fc_lock); + jbd2_journal_unlock_updates(journal); + + /* + * Step 5: If file system device is different from journal device, + * issue a cache flush before we start writing fast commit blocks. */ if (journal->j_fs_dev != journal->j_dev) blkdev_issue_flush(journal->j_fs_dev); blk_start_plug(&plug); + /* Step 6: Write fast commit blocks to disk. */ if (sbi->s_fc_bytes == 0) { /* - * Add a head tag only if this is the first fast commit - * in this TID. + * Step 6.1: Add a head tag only if this is the first fast + * commit in this TID. */ head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); head.fc_tid = cpu_to_le32( @@ -1154,6 +1199,7 @@ static int ext4_fc_perform_commit(journal_t *journal) } } + /* Step 6.2: Now write all the dentry updates. */ spin_lock(&sbi->s_fc_lock); ret = ext4_fc_commit_dentry_updates(journal, &crc); if (ret) { @@ -1161,6 +1207,7 @@ static int ext4_fc_perform_commit(journal_t *journal) goto out; } + /* Step 6.3: Now write all the changed inodes to disk. */ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { inode = &iter->vfs_inode; if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) @@ -1173,10 +1220,8 @@ static int ext4_fc_perform_commit(journal_t *journal) ret = ext4_fc_write_inode(inode, &crc); if (ret) goto out; - spin_lock(&sbi->s_fc_lock); } - spin_unlock(&sbi->s_fc_lock); - + /* Step 6.4: Finally write tail tag to conclude this fast commit. 
*/ ret = ext4_fc_write_tail(sb, crc); out: @@ -1298,7 +1343,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_inode_info *iter, *iter_n; + struct ext4_inode_info *ei; struct ext4_fc_dentry_update *fc_dentry; if (full && sbi->s_fc_bh) @@ -1308,13 +1353,15 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) jbd2_fc_release_bufs(journal); spin_lock(&sbi->s_fc_lock); - list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN], - i_fc_list) { - list_del_init(&iter->i_fc_list); - ext4_clear_inode_state(&iter->vfs_inode, + while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) { + ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN], + struct ext4_inode_info, + i_fc_list); + list_del_init(&ei->i_fc_list); + ext4_clear_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); - if (tid_geq(tid, iter->i_sync_tid)) { - ext4_fc_reset_inode(&iter->vfs_inode); + if (tid_geq(tid, ei->i_sync_tid)) { + ext4_fc_reset_inode(&ei->vfs_inode); } else if (full) { /* * We are called after a full commit, inode has been @@ -1325,15 +1372,19 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) * time in that case (and tid doesn't increase so * tid check above isn't reliable). */ - list_add_tail(&EXT4_I(&iter->vfs_inode)->i_fc_list, + list_add_tail(&ei->i_fc_list, &sbi->s_fc_q[FC_Q_STAGING]); } - /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ + /* + * Make sure clearing of EXT4_STATE_FC_COMMITTING is + * visible before we send the wakeup. Pairs with implicit + * barrier in prepare_to_wait() in ext4_fc_track_inode(). 
+ */ smp_mb(); #if (BITS_PER_LONG < 64) - wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); + wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING); #else - wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING); + wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING); #endif } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 743a1d7633cd..bfaa14bb1049 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -728,7 +728,6 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid) } journal->j_flags |= JBD2_FAST_COMMIT_ONGOING; write_unlock(&journal->j_state_lock); - jbd2_journal_lock_updates(journal); return 0; } @@ -742,7 +741,6 @@ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback) { if (journal->j_fc_cleanup_callback) journal->j_fc_cleanup_callback(journal, 0, tid); - jbd2_journal_unlock_updates(journal); write_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING; if (fallback) -- cgit v1.2.3 From ed45d331135c317c7f80e8c4e0dad644ca8ca9dc Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 8 May 2025 17:59:04 +0000 Subject: ext4: drop i_fc_updates from inode fc info The new logic introduced in this series does not require tracking number of active handles open on an inode. So, drop it. 
Signed-off-by: Harshad Shirwadkar Reviewed-by: Jan Kara Link: https://patch.msgid.link/20250508175908.1004880-6-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 5 ---- fs/ext4/fast_commit.c | 68 --------------------------------------------------- 2 files changed, 73 deletions(-) (limited to 'fs/ext4/fast_commit.c') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 493d9ac7a577..0cb34a06ee4c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1061,9 +1061,6 @@ struct ext4_inode_info { /* End of lblk range that needs to be committed in this fast commit */ ext4_lblk_t i_fc_lblk_len; - /* Number of ongoing updates on this inode */ - atomic_t i_fc_updates; - spinlock_t i_raw_lock; /* protects updates to the raw inode */ /* Fast commit wait queue for this inode */ @@ -2926,8 +2923,6 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode, void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); void ext4_fc_track_inode(handle_t *handle, struct inode *inode); void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle); -void ext4_fc_start_update(struct inode *inode); -void ext4_fc_stop_update(struct inode *inode); void ext4_fc_del(struct inode *inode); bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block); void ext4_fc_replay_cleanup(struct super_block *sb); diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index a2cb4d965dc1..f2e8a5f22260 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -202,32 +202,6 @@ void ext4_fc_init_inode(struct inode *inode) INIT_LIST_HEAD(&ei->i_fc_list); INIT_LIST_HEAD(&ei->i_fc_dilist); init_waitqueue_head(&ei->i_fc_wait); - atomic_set(&ei->i_fc_updates, 0); -} - -/* This function must be called with sbi->s_fc_lock held. 
*/ -static void ext4_fc_wait_committing_inode(struct inode *inode) -__releases(&EXT4_SB(inode->i_sb)->s_fc_lock) -{ - wait_queue_head_t *wq; - struct ext4_inode_info *ei = EXT4_I(inode); - -#if (BITS_PER_LONG < 64) - DEFINE_WAIT_BIT(wait, &ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); -#else - DEFINE_WAIT_BIT(wait, &ei->i_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_flags, - EXT4_STATE_FC_COMMITTING); -#endif - lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock); - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); - schedule(); - finish_wait(wq, &wait.wq_entry); } static bool ext4_fc_disabled(struct super_block *sb) @@ -236,48 +210,6 @@ static bool ext4_fc_disabled(struct super_block *sb) (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)); } -/* - * Inform Ext4's fast about start of an inode update - * - * This function is called by the high level call VFS callbacks before - * performing any inode update. This function blocks if there's an ongoing - * fast commit on the inode in question. - */ -void ext4_fc_start_update(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - - if (ext4_fc_disabled(inode->i_sb)) - return; - -restart: - spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); - if (list_empty(&ei->i_fc_list)) - goto out; - - if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { - ext4_fc_wait_committing_inode(inode); - goto restart; - } -out: - atomic_inc(&ei->i_fc_updates); - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); -} - -/* - * Stop inode update and wake up waiting fast commits if any. - */ -void ext4_fc_stop_update(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - - if (ext4_fc_disabled(inode->i_sb)) - return; - - if (atomic_dec_and_test(&ei->i_fc_updates)) - wake_up_all(&ei->i_fc_wait); -} - /* * Remove inode from fast commit list. 
If the inode is being committed * we wait until inode commit is done. -- cgit v1.2.3 From 69f35ca189300ddba29a16214159beef45bbd984 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 8 May 2025 17:59:05 +0000 Subject: ext4: update code documentation This patch updates code documentation to reflect the commit path changes made in this series. Signed-off-by: Harshad Shirwadkar Reviewed-by: Jan Kara code docs Link: https://patch.msgid.link/20250508175908.1004880-7-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 48 +++++++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 17 deletions(-) (limited to 'fs/ext4/fast_commit.c') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index f2e8a5f22260..06dda39326b4 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -50,19 +50,27 @@ * that need to be committed during a fast commit in another in memory queue of * inodes. During the commit operation, we commit in the following order: * - * [1] Lock inodes for any further data updates by setting COMMITTING state - * [2] Submit data buffers of all the inodes - * [3] Wait for [2] to complete - * [4] Commit all the directory entry updates in the fast commit space - * [5] Commit all the changed inode structures - * [6] Write tail tag (this tag ensures the atomicity, please read the following + * [1] Prepare all the inodes to write out their data by setting + * "EXT4_STATE_FC_FLUSHING_DATA". This ensures that inode cannot be + * deleted while it is being flushed. + * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA" + * state. + * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that + * all the existing handles finish and no new handles can start. + * [4] Mark all the fast commit eligible inodes as undergoing fast commit + * by setting "EXT4_STATE_FC_COMMITTING" state. + * [5] Unlock the journal by calling jbd2_journal_unlock_updates.
This allows + * starting of new handles. If new handles try to start an update on + * any of the inodes that are being committed, ext4_fc_track_inode() + * will block until those inodes have finished the fast commit. + * [6] Commit all the directory entry updates in the fast commit space. + * [7] Commit all the changed inodes in the fast commit space and clear + * "EXT4_STATE_FC_COMMITTING" for these inodes. + * [8] Write tail tag (this tag ensures the atomicity, please read the following * section for more details). - * [7] Wait for [4], [5] and [6] to complete. * - * All the inode updates must call ext4_fc_start_update() before starting an - * update. If such an ongoing update is present, fast commit waits for it to - * complete. The completion of such an update is marked by - * ext4_fc_stop_update(). + * All the inode updates must be enclosed within jbd2_journal_start() + * and jbd2_journal_stop() similar to JBD2 journaling. * * Fast Commit Ineligibility * ------------------------- @@ -143,6 +151,13 @@ * similarly. Thus, by converting a non-idempotent procedure into a series of * idempotent outcomes, fast commits ensured idempotence during the replay. * + * Locking + * ------- + * sbi->s_fc_lock protects the fast commit inodes queue and the fast commit + * dentry queue. ei->i_fc_lock protects the fast commit related info in a given + * inode. Most of the code avoids acquiring both the locks, but if one must do + * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock. + * * TODOs * ----- * @@ -157,13 +172,12 @@ * fast commit recovery even if that area is invalidated by later full * commits. * - * 1) Fast commit's commit path locks the entire file system during fast - * commit. This has significant performance penalty. Instead of that, we - * should use ext4_fc_start/stop_update functions to start inode level - * updates from ext4_journal_start/stop. Once we do that we can drop file - * system locking during commit path.
+ * 1) Handle more ineligible cases. * - * 2) Handle more ineligible cases. + * 2) Change ext4_fc_commit() to lookup logical to physical mapping using extent + * status tree. This would get rid of the need to call ext4_fc_track_inode() + * before acquiring i_data_sem. To do that we would need to ensure that + * modified extents from the extent status tree are not evicted from memory. */ #include -- cgit v1.2.3 From 86e07d4b9b0497afef78af773c74258c8f63030f Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 8 May 2025 17:59:06 +0000 Subject: ext4: temporarily elevate commit thread priority Unlike JBD2 based full commits, there is no dedicated journal thread for fast commits. Thus to reduce scheduling delays between IO submission and completion, temporarily elevate the committer thread's priority to match the configured priority of the JBD2 journal thread. Signed-off-by: Harshad Shirwadkar Reviewed-by: Jan Kara Link: https://patch.msgid.link/20250508175908.1004880-8-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 +++- fs/ext4/fast_commit.c | 13 +++++++++++++ fs/ext4/super.c | 5 ++--- 3 files changed, 18 insertions(+), 4 deletions(-) (limited to 'fs/ext4/fast_commit.c') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0cb34a06ee4c..3987c5bf2ff9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2296,10 +2296,12 @@ static inline int ext4_emergency_state(struct super_block *sb) #define EXT4_DEFM_NODELALLOC 0x0800 /* - * Default journal batch times + * Default journal batch times and ioprio. 
*/ #define EXT4_DEF_MIN_BATCH_TIME 0 #define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ +#define EXT4_DEF_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) + /* * Default values for superblock update diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 06dda39326b4..5f6a8ec249b9 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1216,6 +1216,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) int subtid = atomic_read(&sbi->s_fc_subtid); int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; ktime_t start_time, commit_time; + int old_ioprio, journal_ioprio; if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) return jbd2_complete_transaction(journal, commit_tid); @@ -1223,6 +1224,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) trace_ext4_fc_commit_start(sb, commit_tid); start_time = ktime_get(); + old_ioprio = get_current_ioprio(); restart_fc: ret = jbd2_fc_begin_commit(journal, commit_tid); @@ -1253,6 +1255,15 @@ restart_fc: goto fallback; } + /* + * Now that we know that this thread is going to do a fast commit, + * elevate the priority to match that of the journal thread. 
+ */ + if (journal->j_task->io_context) + journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; + else + journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; + set_task_ioprio(current, journal_ioprio); fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; ret = ext4_fc_perform_commit(journal); if (ret < 0) { @@ -1267,6 +1278,7 @@ restart_fc: } atomic_inc(&sbi->s_fc_subtid); ret = jbd2_fc_end_commit(journal); + set_task_ioprio(current, old_ioprio); /* * weight the commit time higher than the average time so we * don't react too strongly to vast changes in the commit time @@ -1276,6 +1288,7 @@ restart_fc: return ret; fallback: + set_task_ioprio(current, old_ioprio); ret = jbd2_fc_end_commit_fallback(journal); ext4_fc_update_stats(sb, status, 0, 0, commit_tid); return ret; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ed8166fe2ad0..356a96269a21 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1809,7 +1809,6 @@ static const struct fs_parameter_spec ext4_param_specs[] = { {} }; -#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) #define MOPT_SET 0x0001 #define MOPT_CLEAR 0x0002 @@ -5255,7 +5254,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) /* Set defaults for the variables that will be set during parsing */ if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) - ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sectors_written_start = @@ -6495,7 +6494,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) ctx->journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; else - ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; } -- cgit v1.2.3 From 12e64e7f859ed19c5bb497866284d0318c3194a2 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 8 May 2025 17:59:07 +0000 Subject: ext4: convert s_fc_lock to mutex type This allows us to 
hold s_fc_lock during kmem_cache_* functions, which is needed in the following patch. Signed-off-by: Harshad Shirwadkar Reviewed-by: Jan Kara Link: https://patch.msgid.link/20250508175908.1004880-9-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 +- fs/ext4/fast_commit.c | 60 +++++++++++++++++++++++++-------------------------- fs/ext4/super.c | 2 +- 3 files changed, 32 insertions(+), 32 deletions(-) (limited to 'fs/ext4/fast_commit.c') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3987c5bf2ff9..052d7afeefaf 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1754,7 +1754,7 @@ struct ext4_sb_info { * following fields: * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh. */ - spinlock_t s_fc_lock; + struct mutex s_fc_lock; struct buffer_head *s_fc_bh; struct ext4_fc_stats s_fc_stats; tid_t s_fc_ineligible_tid; diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 5f6a8ec249b9..eb888e52261f 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -238,9 +238,9 @@ void ext4_fc_del(struct inode *inode) if (ext4_fc_disabled(inode->i_sb)) return; - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); return; } @@ -275,9 +275,9 @@ void ext4_fc_del(struct inode *inode) #endif prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); schedule(); - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); } finish_wait(wq, &wait.wq_entry); } @@ -288,7 +288,7 @@ void ext4_fc_del(struct inode *inode) * dentry create references, since it is not needed to log it anyways. 
*/ if (list_empty(&ei->i_fc_dilist)) { - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); return; } @@ -298,7 +298,7 @@ void ext4_fc_del(struct inode *inode) list_del_init(&fc_dentry->fcd_dilist); WARN_ON(!list_empty(&ei->i_fc_dilist)); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); @@ -329,12 +329,12 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl has_transaction = false; read_unlock(&sbi->s_journal->j_state_lock); } - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid))) sbi->s_fc_ineligible_tid = tid; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); WARN_ON(reason >= EXT4_FC_REASON_MAX); sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } @@ -373,14 +373,14 @@ static int ext4_fc_track_template( if (!enqueue) return ret; - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); if (list_empty(&EXT4_I(inode)->i_fc_list)) list_add_tail(&EXT4_I(inode)->i_fc_list, (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? 
&sbi->s_fc_q[FC_Q_STAGING] : &sbi->s_fc_q[FC_Q_MAIN]); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); return ret; } @@ -424,7 +424,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, node->fcd_ino = inode->i_ino; take_dentry_name_snapshot(&node->fcd_name, dentry); INIT_LIST_HEAD(&node->fcd_dilist); - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) list_add_tail(&node->fcd_list, @@ -445,7 +445,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, WARN_ON(!list_empty(&ei->i_fc_dilist)); list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist); } - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); spin_lock(&ei->i_fc_lock); return 0; @@ -1000,12 +1000,12 @@ __releases(&sbi->s_fc_lock) list_for_each_entry_safe(fc_dentry, fc_dentry_n, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { ret = -ENOSPC; goto lock_and_exit; } - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); continue; } /* @@ -1018,7 +1018,7 @@ __releases(&sbi->s_fc_lock) inode = &ei->vfs_inode; WARN_ON(inode->i_ino != fc_dentry->fcd_ino); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); /* * We first write the inode and then the create dirent. This @@ -1040,11 +1040,11 @@ __releases(&sbi->s_fc_lock) goto lock_and_exit; } - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); } return 0; lock_and_exit: - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); return ret; } @@ -1064,12 +1064,12 @@ static int ext4_fc_perform_commit(journal_t *journal) * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being * freed until the data flush is over. 
*/ - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_set_inode_state(&iter->vfs_inode, EXT4_STATE_FC_FLUSHING_DATA); } - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); /* Step 2: Flush data for all the eligible inodes. */ ret = ext4_fc_flush_data(journal); @@ -1079,7 +1079,7 @@ static int ext4_fc_perform_commit(journal_t *journal) * any error from step 2. This ensures that waiters waiting on * EXT4_STATE_FC_FLUSHING_DATA can resume. */ - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_clear_inode_state(&iter->vfs_inode, EXT4_STATE_FC_FLUSHING_DATA); @@ -1096,7 +1096,7 @@ static int ext4_fc_perform_commit(journal_t *journal) * prepare_to_wait() in ext4_fc_del(). */ smp_mb(); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); /* * If we encountered error in Step 2, return it now after clearing @@ -1113,12 +1113,12 @@ static int ext4_fc_perform_commit(journal_t *journal) * previous handles are now drained. We now mark the inodes on the * commit queue as being committed. */ - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_set_inode_state(&iter->vfs_inode, EXT4_STATE_FC_COMMITTING); } - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); jbd2_journal_unlock_updates(journal); /* @@ -1146,10 +1146,10 @@ static int ext4_fc_perform_commit(journal_t *journal) } /* Step 6.2: Now write all the dentry updates. 
*/ - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); ret = ext4_fc_commit_dentry_updates(journal, &crc); if (ret) { - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); goto out; } @@ -1159,7 +1159,7 @@ static int ext4_fc_perform_commit(journal_t *journal) if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) continue; - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); ret = ext4_fc_write_inode_data(inode, &crc); if (ret) goto out; @@ -1311,7 +1311,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) trace_ext4_fc_cleanup(journal, full, tid); jbd2_fc_release_bufs(journal); - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) { ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN], struct ext4_inode_info, @@ -1353,11 +1353,11 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) fcd_list); list_del_init(&fc_dentry->fcd_list); list_del_init(&fc_dentry->fcd_dilist); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); } list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], @@ -1372,7 +1372,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) if (full) sbi->s_fc_bytes = 0; - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); trace_ext4_fc_stats(sb); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 356a96269a21..5bd81dd9751c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4481,7 +4481,7 @@ static void ext4_fast_commit_init(struct super_block *sb) sbi->s_fc_bytes = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); sbi->s_fc_ineligible_tid = 0; - spin_lock_init(&sbi->s_fc_lock); + mutex_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); sbi->s_fc_replay_state.fc_regions = NULL; 
sbi->s_fc_replay_state.fc_regions_size = 0; -- cgit v1.2.3 From 6593714d67bab860a733d07895a94404f4ac3039 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 8 May 2025 17:59:08 +0000 Subject: ext4: hold s_fc_lock while during fast commit Leaving s_fc_lock in between during commit in ext4_fc_perform_commit() function leaves room for subtle concurrency bugs where ext4_fc_del() may delete an inode from the fast commit list, leaving list in an inconsistent state. Signed-off-by: Harshad Shirwadkar Reviewed-by: Jan Kara Link: https://patch.msgid.link/20250508175908.1004880-10-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 44 +++++++++++++------------------------------- 1 file changed, 13 insertions(+), 31 deletions(-) (limited to 'fs/ext4/fast_commit.c') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index eb888e52261f..7ac672e35f08 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -424,6 +424,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, node->fcd_ino = inode->i_ino; take_dentry_name_snapshot(&node->fcd_name, dentry); INIT_LIST_HEAD(&node->fcd_dilist); + INIT_LIST_HEAD(&node->fcd_list); mutex_lock(&sbi->s_fc_lock); if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) @@ -985,8 +986,6 @@ static int ext4_fc_flush_data(journal_t *journal) /* Commit all the directory entry updates */ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) -__acquires(&sbi->s_fc_lock) -__releases(&sbi->s_fc_lock) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1000,26 +999,22 @@ __releases(&sbi->s_fc_lock) list_for_each_entry_safe(fc_dentry, fc_dentry_n, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { - mutex_unlock(&sbi->s_fc_lock); - if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { - ret = -ENOSPC; - goto lock_and_exit; - } - 
mutex_lock(&sbi->s_fc_lock); + if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) + return -ENOSPC; continue; } /* * With fcd_dilist we need not loop in sbi->s_fc_q to get the - * corresponding inode pointer + * corresponding inode. Also, the corresponding inode could have been + * deleted, in which case, we don't need to do anything. */ - WARN_ON(list_empty(&fc_dentry->fcd_dilist)); + if (list_empty(&fc_dentry->fcd_dilist)) + continue; ei = list_first_entry(&fc_dentry->fcd_dilist, struct ext4_inode_info, i_fc_dilist); inode = &ei->vfs_inode; WARN_ON(inode->i_ino != fc_dentry->fcd_ino); - mutex_unlock(&sbi->s_fc_lock); - /* * We first write the inode and then the create dirent. This * allows the recovery code to create an unnamed inode first @@ -1029,23 +1024,14 @@ __releases(&sbi->s_fc_lock) */ ret = ext4_fc_write_inode(inode, crc); if (ret) - goto lock_and_exit; - + return ret; ret = ext4_fc_write_inode_data(inode, crc); if (ret) - goto lock_and_exit; - - if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { - ret = -ENOSPC; - goto lock_and_exit; - } - - mutex_lock(&sbi->s_fc_lock); + return ret; + if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) + return -ENOSPC; } return 0; -lock_and_exit: - mutex_lock(&sbi->s_fc_lock); - return ret; } static int ext4_fc_perform_commit(journal_t *journal) @@ -1148,10 +1134,8 @@ static int ext4_fc_perform_commit(journal_t *journal) /* Step 6.2: Now write all the dentry updates. */ mutex_lock(&sbi->s_fc_lock); ret = ext4_fc_commit_dentry_updates(journal, &crc); - if (ret) { - mutex_unlock(&sbi->s_fc_lock); + if (ret) goto out; - } /* Step 6.3: Now write all the changed inodes to disk. 
*/ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { @@ -1159,7 +1143,6 @@ static int ext4_fc_perform_commit(journal_t *journal) if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) continue; - mutex_unlock(&sbi->s_fc_lock); ret = ext4_fc_write_inode_data(inode, &crc); if (ret) goto out; @@ -1171,6 +1154,7 @@ static int ext4_fc_perform_commit(journal_t *journal) ret = ext4_fc_write_tail(sb, crc); out: + mutex_unlock(&sbi->s_fc_lock); blk_finish_plug(&plug); return ret; } @@ -1353,11 +1337,9 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) fcd_list); list_del_init(&fc_dentry->fcd_list); list_del_init(&fc_dentry->fcd_dilist); - mutex_unlock(&sbi->s_fc_lock); release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); - mutex_lock(&sbi->s_fc_lock); } list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], -- cgit v1.2.3 From 86b349ce0312a397a6961e457108556e44a3d211 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 23 Apr 2025 16:52:50 +0800 Subject: ext4: generalize EXT4_GET_BLOCKS_IO_SUBMIT flag usage Currently, the EXT4_GET_BLOCKS_IO_SUBMIT flag is only used during data writeback to indicate that in ordered mode, the journal commit thread should skip re-submitting data and simply wait for I/O completion. To prepare for later patches that need to detect I/O submission context in ext4_map_blocks(), generalizes the meaning of EXT4_GET_BLOCKS_IO_SUBMIT. This flag will be set during: 1) data I/O writeback, 2) I/O completion extents conversion, 3) journal performing commit in fast_commit. This change doesn't affect current usage of this flag and provides a clear way to identify I/O submission context. 
Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20250423085257.122685-3-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 13 ++++++++----- fs/ext4/fast_commit.c | 3 ++- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'fs/ext4/fast_commit.c') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 052d7afeefaf..84cbe8024f1e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -706,9 +706,6 @@ enum { #define EXT4_GET_BLOCKS_CONVERT 0x0010 #define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) - /* Convert extent to initialized after IO complete */ -#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ - EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) /* Eventual metadata allocation (due to growing extent tree) * should not fail, so try to use reserved blocks for that.*/ #define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 @@ -720,9 +717,15 @@ enum { #define EXT4_GET_BLOCKS_ZERO 0x0200 #define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ EXT4_GET_BLOCKS_ZERO) - /* Caller will submit data before dropping transaction handle. This - * allows jbd2 to avoid submitting data before commit. */ + /* Caller is in the context of data submission, such as writeback, + * fsync, etc. Especially, in the generic writeback path, caller will + * submit data before dropping transaction handle. This allows jbd2 + * to avoid submitting data before commit. 
*/ #define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400 + /* Convert extent to initialized after IO complete */ +#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT |\ + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |\ + EXT4_GET_BLOCKS_IO_SUBMIT) /* Caller is in the atomic contex, find extent if it has been cached */ #define EXT4_GET_BLOCKS_CACHED_NOWAIT 0x0800 diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 7ac672e35f08..bfe5b3c40078 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -917,7 +917,8 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) while (cur_lblk_off <= new_blk_size) { map.m_lblk = cur_lblk_off; map.m_len = new_blk_size - cur_lblk_off + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); + ret = ext4_map_blocks(NULL, inode, &map, + EXT4_GET_BLOCKS_IO_SUBMIT); if (ret < 0) return -ECANCELED; -- cgit v1.2.3 From 402e38e6b71f5739119ca3107f375e112d63c7c5 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 23 Apr 2025 16:52:51 +0800 Subject: ext4: prevent stale extent cache entries caused by concurrent I/O writeback Currently, in the I/O writeback path, ext4_map_blocks() may attempt to cache additional unrelated extents in the extent status tree without holding the inode's i_rwsem and the mapping's invalidate_lock. This can lead to stale extent status entries remaining in certain scenarios, potentially causing data corruption. For example, when performing a collapse range in ext4_collapse_range(), it clears the extent cache and dirty pages before removing blocks and shifting extents. It also holds the i_data_sem during these two operations. However, both ext4_ext_remove_space() and ext4_ext_shift_extents() may briefly release the i_data_sem if journal credits are insufficient (ext4_datasem_ensure_credits()). If another writeback process writes dirty pages from other regions during this interval, it may cache extents that are about to be modified. 
Unless ext4_collapse_range() explicitly clears the extent cache again, these cached entries can become stale and inconsistent with the actual extents. 0 a n b c m | | | | | | [www][wwwwww][wwwwwwww]...[wwwww][wwww]... | | N M Assume that block a is dirty. The collapse range operation is removing data from n to m and drops i_data_sem immediately after removing the extent from b to c. At the same time, a concurrent writeback begins to write back block a; it will reload the extent [n, b) into the extent status tree since it does not hold the i_rwsem or the invalidate_lock. After the collapse range operation completes, the stale extent [n, b) is left behind; it maps logical block n to N, but the actual physical block of n should be M. Similarly, both ext4_insert_range() and ext4_truncate() have the same problem. ext4_punch_hole() is unaffected because it re-adds a hole extent entry after removing space, since commit 9f1118223aa0 ("ext4: add a hole extent entry in cache after punch"). In most cases, during dirty page writeback, the block mapping information is likely to be found in the extent cache, making it less necessary to search for physical extents. Consequently, loading unrelated extent caches during writeback appears to be ineffective. Therefore, fix this by adding EXT4_EX_NOCACHE in the writeback path to prevent caching of unrelated extents, eliminating this potential source of corruption. 
Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20250423085257.122685-4-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/extents.c | 12 +++++++++--- fs/ext4/fast_commit.c | 3 ++- fs/ext4/inode.c | 28 ++++++++++++++++++++-------- 4 files changed, 32 insertions(+), 12 deletions(-) (limited to 'fs/ext4/fast_commit.c') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 84cbe8024f1e..3e724def1b06 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -741,6 +741,7 @@ enum { #define EXT4_EX_NOCACHE 0x40000000 #define EXT4_EX_FORCE_CACHE 0x20000000 #define EXT4_EX_NOFAIL 0x10000000 +#define EXT4_EX_FILTER 0x70000000 /* * Flags used by ext4_free_blocks diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d8eac736cc9a..8a5724b2dc51 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4202,7 +4202,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* find extent for this block */ - path = ext4_find_extent(inode, map->m_lblk, NULL, 0); + path = ext4_find_extent(inode, map->m_lblk, NULL, flags); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; @@ -4315,7 +4315,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, goto out; ar.lright = map->m_lblk; err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, - &ex2, 0); + &ex2, flags); if (err < 0) goto out; @@ -4820,8 +4820,14 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, break; } } + /* + * Do not cache any unrelated extents, as it does not hold the + * i_rwsem or invalidate_lock, which could corrupt the extent + * status tree. 
+ */ ret = ext4_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_IO_CONVERT_EXT); + EXT4_GET_BLOCKS_IO_CONVERT_EXT | + EXT4_EX_NOCACHE); if (ret <= 0) ext4_warning(inode->i_sb, "inode #%lu: block %u: len %u: " diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index bfe5b3c40078..1392241de5e6 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -918,7 +918,8 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) map.m_lblk = cur_lblk_off; map.m_len = new_blk_size - cur_lblk_off + 1; ret = ext4_map_blocks(NULL, inode, &map, - EXT4_GET_BLOCKS_IO_SUBMIT); + EXT4_GET_BLOCKS_IO_SUBMIT | + EXT4_EX_NOCACHE); if (ret < 0) return -ECANCELED; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3005053e92a7..8c0d6fa58f26 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -463,15 +463,16 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, #endif /* ES_AGGRESSIVE_TEST */ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map) + struct ext4_map_blocks *map, int flags) { unsigned int status; int retval; + flags &= EXT4_EX_FILTER; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - retval = ext4_ext_map_blocks(handle, inode, map, 0); + retval = ext4_ext_map_blocks(handle, inode, map, flags); else - retval = ext4_ind_map_blocks(handle, inode, map, 0); + retval = ext4_ind_map_blocks(handle, inode, map, flags); if (retval <= 0) return retval; @@ -622,6 +623,13 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) return -EFSCORRUPTED; + /* + * Do not allow caching of unrelated ranges of extents during I/O + * submission. 
+ */ + if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) + WARN_ON_ONCE(!(flags & EXT4_EX_NOCACHE)); + /* Lookup extent status tree firstly */ if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) && ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { @@ -667,7 +675,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); - retval = ext4_map_query_blocks(handle, inode, map); + retval = ext4_map_query_blocks(handle, inode, map, flags); up_read((&EXT4_I(inode)->i_data_sem)); found: @@ -1807,7 +1815,7 @@ found: if (ext4_has_inline_data(inode)) retval = 0; else - retval = ext4_map_query_blocks(NULL, inode, map); + retval = ext4_map_query_blocks(NULL, inode, map, 0); up_read(&EXT4_I(inode)->i_data_sem); if (retval) return retval < 0 ? retval : 0; @@ -1830,7 +1838,7 @@ add_delayed: goto found; } } else if (!ext4_has_inline_data(inode)) { - retval = ext4_map_query_blocks(NULL, inode, map); + retval = ext4_map_query_blocks(NULL, inode, map, 0); if (retval) { up_write(&EXT4_I(inode)->i_data_sem); return retval < 0 ? retval : 0; @@ -2214,11 +2222,15 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) * previously reserved. However we must not fail because we're in * writeback and there is nothing we can do about it so it might result * in data loss. So use reserved blocks to allocate metadata if - * possible. + * possible. In addition, do not cache any unrelated extents, as it + * only holds the folio lock but does not hold the i_rwsem or + * invalidate_lock, which could corrupt the extent status tree. 
*/ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL | - EXT4_GET_BLOCKS_IO_SUBMIT; + EXT4_GET_BLOCKS_IO_SUBMIT | + EXT4_EX_NOCACHE; + dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; -- cgit v1.2.3 From 6cbab5f95e49ec8a9f21784fae3ff0ee09b2dfbc Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 12 May 2025 22:38:06 -0700 Subject: ext4: remove sbi argument from ext4_chksum() Since ext4_chksum() no longer uses its sbi argument, remove it. Signed-off-by: Eric Biggers Reviewed-by: Baokun Li Link: https://patch.msgid.link/20250513053809.699974-2-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/bitmap.c | 8 ++++---- fs/ext4/ext4.h | 3 +-- fs/ext4/extents.c | 3 +-- fs/ext4/fast_commit.c | 10 +++++----- fs/ext4/ialloc.c | 5 ++--- fs/ext4/inode.c | 19 ++++++++----------- fs/ext4/ioctl.c | 4 ++-- fs/ext4/mmp.c | 2 +- fs/ext4/namei.c | 10 ++++------ fs/ext4/orphan.c | 13 ++++++------- fs/ext4/super.c | 13 ++++++------- fs/ext4/xattr.c | 10 +++++----- 12 files changed, 45 insertions(+), 55 deletions(-) (limited to 'fs/ext4/fast_commit.c') diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index a4dbaccee6e7..87760fabdd2e 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -30,7 +30,7 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb, sz = EXT4_INODES_PER_GROUP(sb) >> 3; provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); - calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) { hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi); provided |= (hi << 16); @@ -52,7 +52,7 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb, return; sz = EXT4_INODES_PER_GROUP(sb) >> 3; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); 
gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16); @@ -71,7 +71,7 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, return 1; provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); - calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) { hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi); provided |= (hi << 16); @@ -92,7 +92,7 @@ void ext4_block_bitmap_csum_set(struct super_block *sb, if (!ext4_has_feature_metadata_csum(sb)) return; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cda06ed468ca..f414a1a16d9a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2494,8 +2494,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) #define DX_HASH_SIPHASH 6 #define DX_HASH_LAST DX_HASH_SIPHASH -static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, - const void *address, unsigned int length) +static inline u32 ext4_chksum(u32 crc, const void *address, unsigned int length) { return crc32c(crc, address, length); } diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ddc3a7529961..995be1051cfa 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -50,10 +50,9 @@ static __le32 ext4_extent_block_csum(struct inode *inode, struct ext4_extent_header *eh) { struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh, + csum = ext4_chksum(ei->i_csum_seed, (__u8 
*)eh, EXT4_EXTENT_TAIL_OFFSET(eh)); return cpu_to_le32(csum); } diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 1392241de5e6..42bee1d4f9f9 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -734,7 +734,7 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) tl.fc_len = cpu_to_le16(remaining); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining); - *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize); + *crc = ext4_chksum(*crc, sbi->s_fc_bh->b_data, bsize); ext4_fc_submit_bh(sb, false); @@ -781,7 +781,7 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid)); dst += sizeof(tail.fc_tid); - crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data, + crc = ext4_chksum(crc, sbi->s_fc_bh->b_data, dst - (u8 *)sbi->s_fc_bh->b_data); tail.fc_crc = cpu_to_le32(crc); memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc)); @@ -2133,13 +2133,13 @@ static int ext4_fc_replay_scan(journal_t *journal, case EXT4_FC_TAG_INODE: case EXT4_FC_TAG_PAD: state->fc_cur_tag++; - state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, + state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; case EXT4_FC_TAG_TAIL: state->fc_cur_tag++; memcpy(&tail, val, sizeof(tail)); - state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, + state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + offsetof(struct ext4_fc_tail, fc_crc)); @@ -2166,7 +2166,7 @@ static int ext4_fc_replay_scan(journal_t *journal, break; } state->fc_cur_tag++; - state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, + state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; default: diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 4938e78cbadc..79aa3df8d019 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1288,10 +1288,9 @@ got: __u32 
csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); - ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, - sizeof(gen)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 01e42cce572e..6c56735df4cb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -58,29 +58,27 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle, static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __u16 dummy_csum = 0; int offset = offsetof(struct ext4_inode, i_checksum_lo); unsigned int csum_size = sizeof(dummy_csum); - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset); - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)raw, offset); + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; - csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + csum = ext4_chksum(csum, (__u8 *)raw + offset, EXT4_GOOD_OLD_INODE_SIZE - offset); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { offset = offsetof(struct ext4_inode, i_checksum_hi); - csum = ext4_chksum(sbi, csum, (__u8 *)raw + - EXT4_GOOD_OLD_INODE_SIZE, + csum = ext4_chksum(csum, (__u8 *)raw + EXT4_GOOD_OLD_INODE_SIZE, offset - EXT4_GOOD_OLD_INODE_SIZE); if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; } - csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + csum = ext4_chksum(csum, (__u8 *)raw + offset, EXT4_INODE_SIZE(inode->i_sb) - offset); } @@ -4922,10 +4920,9 @@ struct 
inode *__ext4_iget(struct super_block *sb, unsigned long ino, __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = raw_inode->i_generation; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); - ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, - sizeof(gen)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } if ((!ext4_inode_csum_verify(inode, raw_inode, ei) || diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 0e240013c84d..bef9a148e433 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -354,8 +354,8 @@ void ext4_reset_inode_seed(struct inode *inode) if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); - ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen)); + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } /* diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 3e26464b1425..51661570cf3b 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -14,7 +14,7 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) int offset = offsetof(struct mmp_struct, mmp_checksum); __u32 csum; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (char *)mmp, offset); + csum = ext4_chksum(sbi->s_csum_seed, (char *)mmp, offset); return cpu_to_le32(csum); } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index e9712e64ec8f..a178ac229489 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -346,11 +346,10 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, static __le32 ext4_dirblock_csum(struct inode *inode, void *dirent, int size) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size); 
return cpu_to_le32(csum); } @@ -442,7 +441,6 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode, static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, int count_offset, int count, struct dx_tail *t) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; int size; @@ -450,9 +448,9 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, int offset = offsetof(struct dx_tail, dt_checksum); size = count_offset + (count * sizeof(struct dx_entry)); - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); - csum = ext4_chksum(sbi, csum, (__u8 *)t, offset); - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size); + csum = ext4_chksum(csum, (__u8 *)t, offset); + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); return cpu_to_le32(csum); } diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index c66e0cb29bd4..7c7f792ad6ab 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -541,9 +541,9 @@ static int ext4_orphan_file_block_csum_verify(struct super_block *sb, return 1; ot = ext4_orphan_block_tail(sb, bh); - calculated = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, - (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); - calculated = ext4_chksum(EXT4_SB(sb), calculated, (__u8 *)bh->b_data, + calculated = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr, + sizeof(dsk_block_nr)); + calculated = ext4_chksum(calculated, (__u8 *)bh->b_data, inodes_per_ob * sizeof(__u32)); return le32_to_cpu(ot->ob_checksum) == calculated; } @@ -560,10 +560,9 @@ void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers, struct ext4_orphan_block_tail *ot; __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); - csum = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, - (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); - csum = ext4_chksum(EXT4_SB(sb), csum, (__u8 *)data, - 
inodes_per_ob * sizeof(__u32)); + csum = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr, + sizeof(dsk_block_nr)); + csum = ext4_chksum(csum, (__u8 *)data, inodes_per_ob * sizeof(__u32)); ot = ext4_orphan_block_tail(sb, bh); ot->ob_checksum = cpu_to_le32(csum); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7c8fdb311bad..b4e8b17cb908 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -289,11 +289,10 @@ static int ext4_verify_csum_type(struct super_block *sb, __le32 ext4_superblock_csum(struct super_block *sb, struct ext4_super_block *es) { - struct ext4_sb_info *sbi = EXT4_SB(sb); int offset = offsetof(struct ext4_super_block, s_checksum); __u32 csum; - csum = ext4_chksum(sbi, ~0, (char *)es, offset); + csum = ext4_chksum(~0, (char *)es, offset); return cpu_to_le32(csum); } @@ -3206,14 +3205,14 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, __u32 csum32; __u16 dummy_csum = 0; - csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, + csum32 = ext4_chksum(sbi->s_csum_seed, (__u8 *)&le_group, sizeof(le_group)); - csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset); - csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum, + csum32 = ext4_chksum(csum32, (__u8 *)gdp, offset); + csum32 = ext4_chksum(csum32, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); if (offset < sbi->s_desc_size) - csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset, + csum32 = ext4_chksum(csum32, (__u8 *)gdp + offset, sbi->s_desc_size - offset); crc = csum32 & 0xFFFF; @@ -4641,7 +4640,7 @@ static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_blo sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); else if (ext4_has_feature_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) - sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, + sbi->s_csum_seed = ext4_chksum(~0, es->s_uuid, sizeof(es->s_uuid)); return 0; } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 7ab8f2e8e815..8d15acbacc20 
100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -139,12 +139,12 @@ static __le32 ext4_xattr_block_csum(struct inode *inode, __u32 dummy_csum = 0; int offset = offsetof(struct ext4_xattr_header, h_checksum); - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr, + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); - csum = ext4_chksum(sbi, csum, (__u8 *)hdr, offset); - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); + csum = ext4_chksum(csum, (__u8 *)hdr, offset); + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); - csum = ext4_chksum(sbi, csum, (__u8 *)hdr + offset, + csum = ext4_chksum(csum, (__u8 *)hdr + offset, EXT4_BLOCK_SIZE(inode->i_sb) - offset); return cpu_to_le32(csum); @@ -348,7 +348,7 @@ xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry, static u32 ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size) { - return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size); + return ext4_chksum(sbi->s_csum_seed, buffer, size); } static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode) -- cgit v1.2.3