From 05496769e5da83ce22ed97345afd9c7b71d6bd24 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 16 Sep 2008 14:36:17 -0400 Subject: jbd2: clean up how the journal device name is printed Calculate the journal device name once and stash it away in the journal_s structure. This avoids needing to call bdevname() everywhere and reduces stack usage by not needing to allocate an on-stack buffer. In addition, we eliminate the '/' that can appear in device names (e.g. "cciss/c0d0p9" --- see kernel bugzilla #11321) that can cause problems when creating proc directory names, and include the inode number to support ocfs2 which creates multiple journals with different inode numbers. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'fs/jbd2/commit.c') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index f2ad061e95ec..b091e5378fe0 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -147,12 +147,9 @@ static int journal_submit_commit_record(journal_t *journal, * to remember if we sent a barrier request */ if (ret == -EOPNOTSUPP && barrier_done) { - char b[BDEVNAME_SIZE]; - printk(KERN_WARNING - "JBD: barrier-based sync failed on %s - " - "disabling barriers\n", - bdevname(journal->j_dev, b)); + "JBD: barrier-based sync failed on %s - " + "disabling barriers\n", journal->j_devname); spin_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_BARRIER; spin_unlock(&journal->j_state_lock); @@ -681,11 +678,9 @@ start_journal_io: */ err = journal_finish_inode_data_buffers(journal, commit_transaction); if (err) { - char b[BDEVNAME_SIZE]; - printk(KERN_WARNING "JBD2: Detected IO errors while flushing file data " - "on %s\n", bdevname(journal->j_fs_dev, b)); + "on %s\n", journal->j_devname); err = 0; } -- cgit v1.2.3 From ede86cc473defab74d778aeac14b19f43129d4d1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 5 Oct 2008 20:50:06 -0400 Subject: ext4: Add debugging markers that can be used by systemtap This debugging markers are designed to debug problems such as the random filesystem latency problems reported by Arjan. Signed-off-by: "Theodore Ts'o" --- fs/ext4/fsync.c | 5 +++++ fs/ext4/super.c | 2 ++ fs/jbd2/checkpoint.c | 3 +++ fs/jbd2/commit.c | 6 ++++++ 4 files changed, 16 insertions(+) (limited to 'fs/jbd2/commit.c') diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index c37d1e86f51a..5afe4370840b 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "ext4.h" #include "ext4_jbd2.h" @@ -51,6 +52,10 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) J_ASSERT(ext4_journal_current_handle() == NULL); + trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld", + inode->i_sb->s_id, datasync, inode->i_ino, + dentry->d_parent->d_inode->i_ino); + /* * data=writeback: * The caller's filemap_fdatawrite()/wait will sync the data. diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dfcd41fafb9f..9c0214689de0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -2951,6 +2952,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) { tid_t target; + trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); sb->s_dirt = 0; if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { if (wait) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index af4651bf3570..42895d369458 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -328,6 +329,8 @@ int jbd2_log_do_checkpoint(journal_t *journal) * journal straight away. */ result = jbd2_cleanup_journal_tail(journal); + trace_mark(jbd2_checkpoint, "dev %s need_checkpoint %d", + journal->j_devname, result); jbd_debug(1, "cleanup_journal_tail returned %d\n", result); if (result <= 0) return result; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index b091e5378fe0..e91f051a9859 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -368,6 +369,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) commit_transaction = journal->j_running_transaction; J_ASSERT(commit_transaction->t_state == T_RUNNING); + trace_mark(jbd2_start_commit, "dev %s transaction %d", + journal->j_devname, commit_transaction->t_tid); jbd_debug(1, "JBD: starting commit of transaction %d\n", commit_transaction->t_tid); @@ -985,6 +988,9 @@ restart_loop: } spin_unlock(&journal->j_list_lock); + trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", + journal->j_devname, commit_transaction->t_tid, + journal->j_tail_sequence); jbd_debug(1, "JBD: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); -- cgit v1.2.3 From 45a90bfd90c1215bf824c0f705b409723f52361b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 6 Oct 2008 12:04:02 -0400 Subject: jbd2: Fix buffer head leak when writing the commit block Also make sure the buffer heads are marked clean before submitting bh for writing. The previous code was marking the buffer head dirty, which would have forced an unneeded write (and seek) to the journal for no good reason. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs/jbd2/commit.c') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index e91f051a9859..0d3814a35ed1 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -127,8 +127,7 @@ static int journal_submit_commit_record(journal_t *journal, JBUFFER_TRACE(descriptor, "submit commit block"); lock_buffer(bh); - get_bh(bh); - set_buffer_dirty(bh); + clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; @@ -158,7 +157,7 @@ static int journal_submit_commit_record(journal_t *journal, /* And try again, without the barrier */ lock_buffer(bh); set_buffer_uptodate(bh); - set_buffer_dirty(bh); + clear_buffer_dirty(bh); ret = submit_bh(WRITE, bh); } *cbh = bh; -- cgit v1.2.3 From 77e841de8abac4755cc83ca224fdf71418d65380 Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Sun, 12 Oct 2008 16:39:16 -0400 Subject: jbd2: abort when failed to log metadata buffers If we failed to write metadata buffers to the journal space and succeeded to write the commit record, stale data can be written back to the filesystem as metadata in the recovery phase. To avoid this, when we failed to write out metadata buffers, abort the journal before writing the commit record. We can also avoid this kind of corruption by using the journal checksum feature because it can detect invalid metadata blocks in the journal and avoid them from being replayed. So we don't need to care about asynchronous commit record writeout with a checksum. Signed-off-by: Hidehiro Kawai Signed-off-by: Theodore Ts'o --- fs/jbd2/commit.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/jbd2/commit.c') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 0d3814a35ed1..78e4da934121 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -783,6 +783,9 @@ wait_for_iobuf: /* AKPM: bforget here */ } + if (err) + jbd2_journal_abort(journal, err); + jbd_debug(3, "JBD: commit phase 5\n"); if (!JBD2_HAS_INCOMPAT_FEATURE(journal, -- cgit v1.2.3 From 7ad7445f60fe4d46c4c9d2a9463db180d2a3b270 Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Fri, 10 Oct 2008 20:29:31 -0400 Subject: jbd2: don't dirty original metadata buffer on abort Currently, original metadata buffers are dirtied when they are unfiled whether the journal has aborted or not. Eventually these buffers will be written-back to the filesystem by pdflush. This means some metadata buffers are written to the filesystem without journaling if the journal aborts. So if both journal abort and system crash happen at the same time, the filesystem would become inconsistent state. Additionally, replaying journaled metadata can overwrite the latest metadata on the filesystem partly. Because, if the journal gets aborted, journaled metadata are preserved and replayed during the next mount not to lose uncheckpointed metadata. This would also break the consistency of the filesystem. This patch prevents original metadata buffers from being dirtied on abort by clearing BH_JBDDirty flag from those buffers. Thus, no metadata buffers are written to the filesystem without journaling. Signed-off-by: Hidehiro Kawai Signed-off-by: Theodore Ts'o --- fs/jbd2/commit.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/jbd2/commit.c') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 78e4da934121..849f10496cea 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -504,9 +504,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) jh = commit_transaction->t_buffers; /* If we're in abort mode, we just un-journal the buffer and - release it for background writing. */ + release it. */ if (is_journal_aborted(journal)) { + clear_buffer_jbddirty(jh2bh(jh)); JBUFFER_TRACE(jh, "journal is aborting: refile"); jbd2_journal_refile_buffer(journal, jh); /* If that was the last one, we need to clean up @@ -884,6 +885,8 @@ restart_loop: if (buffer_jbddirty(bh)) { JBUFFER_TRACE(jh, "add to new checkpointing trans"); __jbd2_journal_insert_checkpoint(jh, commit_transaction); + if (is_journal_aborted(journal)) + clear_buffer_jbddirty(bh); JBUFFER_TRACE(jh, "refile for checkpoint writeback"); __jbd2_journal_refile_buffer(jh); jbd_unlock_bh_state(bh); -- cgit v1.2.3 From 5bf5683a33f3584da6eced480967c4f7e11515a8 Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Fri, 10 Oct 2008 22:12:43 -0400 Subject: ext4: add an option to control error handling on file data If the journal doesn't abort when it gets an IO error in file data blocks, the file data corruption will spread silently. Because most of applications and commands do buffered writes without fsync(), they don't notice the IO error. It's scary for mission critical systems. On the other hand, if the journal aborts whenever it gets an IO error in file data blocks, the system will easily become inoperable. So this patch introduces a filesystem option to determine whether it aborts the journal or just call printk() when it gets an IO error in file data. If you mount an ext4 fs with data_err=abort option, it aborts on file data write error. If you mount it with data_err=ignore, it doesn't abort, just call printk(). data_err=ignore is the default. Here is the corresponding patch of the ext3 version: http://kerneltrap.org/mailarchive/linux-kernel/2008/9/9/3239374 Signed-off-by: Hidehiro Kawai Signed-off-by: Theodore Ts'o --- Documentation/filesystems/ext4.txt | 5 +++++ fs/ext4/ext4.h | 2 ++ fs/ext4/super.c | 16 ++++++++++++++++ fs/jbd2/commit.c | 2 ++ include/linux/jbd2.h | 3 +++ 5 files changed, 28 insertions(+) (limited to 'fs/jbd2/commit.c') diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 74484e696405..eb154ef36c2a 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -223,6 +223,11 @@ errors=remount-ro(*) Remount the filesystem read-only on an error. errors=continue Keep going on a filesystem error. errors=panic Panic and halt the machine if an error occurs. +data_err=ignore(*) Just print an error message if an error occurs + in a file data buffer in ordered mode. +data_err=abort Abort the journal if an error occurs in a file + data buffer in ordered mode. + grpid Give objects the same group ID as their creator. bsdgroups diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f46a513a5157..6690a41cdd9f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -540,6 +540,8 @@ do { \ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ +#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ + /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 79bd3989e84f..014677b8e224 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -778,6 +778,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_printf(seq, ",inode_readahead_blks=%u", sbi->s_inode_readahead_blks); + if (test_opt(sb, DATA_ERR_ABORT)) + seq_puts(seq, ",data_err=abort"); + ext4_show_quota_options(seq, sb); return 0; } @@ -907,6 +910,7 @@ enum { Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, @@ -953,6 +957,8 @@ static match_table_t tokens = { {Opt_data_journal, "data=journal"}, {Opt_data_ordered, "data=ordered"}, {Opt_data_writeback, "data=writeback"}, + {Opt_data_err_abort, "data_err=abort"}, + {Opt_data_err_ignore, "data_err=ignore"}, {Opt_offusrjquota, "usrjquota="}, {Opt_usrjquota, "usrjquota=%s"}, {Opt_offgrpjquota, "grpjquota="}, @@ -1187,6 +1193,12 @@ static int parse_options(char *options, struct super_block *sb, sbi->s_mount_opt |= data_opt; } break; + case Opt_data_err_abort: + set_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + break; + case Opt_data_err_ignore: + clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + break; #ifdef CONFIG_QUOTA case Opt_usrjquota: qtype = USRQUOTA; @@ -2535,6 +2547,10 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_flags |= JBD2_BARRIER; else journal->j_flags &= ~JBD2_BARRIER; + if (test_opt(sb, DATA_ERR_ABORT)) + journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; + else + journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; spin_unlock(&journal->j_state_lock); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 849f10496cea..0abe02c4242a 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -684,6 +684,8 @@ start_journal_io: printk(KERN_WARNING "JBD2: Detected IO errors while flushing file data " "on %s\n", journal->j_devname); + if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR) + jbd2_journal_abort(journal, err); err = 0; } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index c9e7d781db31..d2e91ea998fd 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -967,6 +967,9 @@ struct journal_s #define JBD2_FLUSHED 0x008 /* The journal superblock has been flushed */ #define JBD2_LOADED 0x010 /* The journal superblock has been loaded */ #define JBD2_BARRIER 0x020 /* Use IDE barriers */ +#define JBD2_ABORT_ON_SYNCDATA_ERR 0x040 /* Abort the journal on file + * data write error in ordered + * mode */ /* * Function declarations for the journaling transaction and buffer -- cgit v1.2.3