From 229309caebe4508d650bb6d8f7d51f2b116f5bbd Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Sun, 8 May 2011 19:09:53 -0400 Subject: jbd2: Fix forever sleeping process in do_get_write_access() In do_get_write_access() we wait on the BH_Unshadow bit for the buffer to leave the shadow state. The waking code in journal_commit_transaction() has a bug because it does not issue a memory barrier after the buffer is moved from the shadow state and before wake_up_bit() is called. Thus a waitqueue check can happen before the buffer is actually moved from the shadow state and the waiting process may never be woken. Fix the problem by issuing a proper barrier. Reported-by: Tao Ma <boyu.mt@taobao.com> Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/jbd2/commit.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs/jbd2/commit.c') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 6e28000a4b21..78c299218681 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -760,8 +760,13 @@ wait_for_iobuf: required. */ JBUFFER_TRACE(jh, "file as BJ_Forget"); jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); - /* Wake up any transactions which were waiting for this - IO to complete */ + /* + * Wake up any transactions which were waiting for this IO to + * complete. The barrier must be here so that changes by + * jbd2_journal_file_buffer() take effect before wake_up_bit() + * does the waitqueue check. 
+ */ + smp_mb(); wake_up_bit(&bh->b_state, BH_Unshadow); JBUFFER_TRACE(jh, "brelse shadowed buffer"); __brelse(bh); -- cgit v1.2.3 From 81be12c8179c1c397d3f179cdd9b3f7146cf47f1 Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Tue, 24 May 2011 11:52:40 -0400 Subject: jbd2: fix sending of data flush on journal commit In data=ordered mode, it's theoretically possible (however rare) that an inode is filed to transaction's t_inode_list and a flusher thread writes all the data and inode is reclaimed before the transaction starts to commit. In such a case, we could erroneously omit sending a flush to file system device when it is different from the journal device (because data can still be in disk cache only). Fix the problem by setting a flag in a transaction when some inode is added to it and then send disk flush in the commit code when the flag is set. Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/jbd2/commit.c | 3 +-- fs/jbd2/transaction.c | 7 +++++++ include/linux/jbd2.h | 4 +++- 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'fs/jbd2/commit.c') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 78c299218681..2d5095ecc25f 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -219,7 +219,6 @@ static int journal_submit_data_buffers(journal_t *journal, ret = err; spin_lock(&journal->j_list_lock); J_ASSERT(jinode->i_transaction == commit_transaction); - commit_transaction->t_flushed_data_blocks = 1; clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); smp_mb__after_clear_bit(); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); @@ -683,7 +682,7 @@ start_journal_io: * then we must flush the file system device before we issue * the commit record */ - if (commit_transaction->t_flushed_data_blocks && + if (commit_transaction->t_need_data_flush && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); diff --git 
a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 85a055ef93fe..20065c9f2479 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2147,6 +2147,13 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) jinode->i_next_transaction == transaction) goto done; + /* + * We only ever set this variable to 1 so the test is safe. Since + * t_need_data_flush is likely to be set, we do the test to save some + * cacheline bouncing + */ + if (!transaction->t_need_data_flush) + transaction->t_need_data_flush = 1; /* On some different transaction's list - should be * the committing one */ if (jinode->i_transaction) { diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index a32dcaec04e1..4d57955061f4 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -658,7 +658,9 @@ struct transaction_s * waiting for it to finish. */ unsigned int t_synchronous_commit:1; - unsigned int t_flushed_data_blocks:1; + + /* Disk flush needs to be sent to fs partition [no locking] */ + int t_need_data_flush; /* * For use by the filesystem to store fs-specific data -- cgit v1.2.3 From bbd2be36910728f485ac78ea36e0f4f5a38e691e Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Tue, 24 May 2011 11:59:18 -0400 Subject: jbd2: Add function jbd2_trans_will_send_data_barrier() Provide a function which returns whether a transaction with given tid will send a flush to the filesystem device. The function will be used by ext4 to detect whether fsync needs to send a separate flush or not. 
Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/jbd2/commit.c | 10 +++++++++- fs/jbd2/journal.c | 41 +++++++++++++++++++++++++++++++++++++++++ include/linux/jbd2.h | 4 +++- 3 files changed, 53 insertions(+), 2 deletions(-) (limited to 'fs/jbd2/commit.c') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 2d5095ecc25f..5b506e53c70b 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -677,6 +677,10 @@ start_journal_io: err = 0; } + write_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_COMMIT); + commit_transaction->t_state = T_COMMIT_DFLUSH; + write_unlock(&journal->j_state_lock); /* * If the journal is not located on the file system device, * then we must flush the file system device before we issue @@ -804,6 +808,10 @@ wait_for_iobuf: jbd2_journal_abort(journal, err); jbd_debug(3, "JBD: commit phase 5\n"); + write_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH); + commit_transaction->t_state = T_COMMIT_JFLUSH; + write_unlock(&journal->j_state_lock); if (!JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { @@ -959,7 +967,7 @@ restart_loop: jbd_debug(3, "JBD: commit phase 7\n"); - J_ASSERT(commit_transaction->t_state == T_COMMIT); + J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH); commit_transaction->t_start = jiffies; stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging, diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index cd2d341f602e..9a7826990304 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -587,6 +587,47 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) return ret; } +/* + * Return 1 if a given transaction has not yet sent barrier request + * connected with a transaction commit. If 0 is returned, transaction + * may or may not have sent the barrier. Used to avoid sending barrier + * twice in common cases. 
+ */ +int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid) +{ + int ret = 0; + transaction_t *commit_trans; + + if (!(journal->j_flags & JBD2_BARRIER)) + return 0; + read_lock(&journal->j_state_lock); + /* Transaction already committed? */ + if (tid_geq(journal->j_commit_sequence, tid)) + goto out; + commit_trans = journal->j_committing_transaction; + if (!commit_trans || commit_trans->t_tid != tid) { + ret = 1; + goto out; + } + /* + * Transaction is being committed and we already proceeded to + * submitting a flush to fs partition? + */ + if (journal->j_fs_dev != journal->j_dev) { + if (!commit_trans->t_need_data_flush || + commit_trans->t_state >= T_COMMIT_DFLUSH) + goto out; + } else { + if (commit_trans->t_state >= T_COMMIT_JFLUSH) + goto out; + } + ret = 1; +out: + read_unlock(&journal->j_state_lock); + return ret; +} +EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier); + /* * Wait for a specified commit to complete. * The caller may not hold the journal lock. diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 4d57955061f4..4ecb7b16b278 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -529,9 +529,10 @@ struct transaction_s enum { T_RUNNING, T_LOCKED, - T_RUNDOWN, T_FLUSH, T_COMMIT, + T_COMMIT_DFLUSH, + T_COMMIT_JFLUSH, T_FINISHED } t_state; @@ -1230,6 +1231,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); int jbd2_journal_force_commit_nested(journal_t *journal); int jbd2_log_wait_commit(journal_t *journal, tid_t tid); int jbd2_log_do_checkpoint(journal_t *journal); +int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); void __jbd2_log_wait_for_space(journal_t *journal); extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); -- cgit v1.2.3