From 567f3e9a70d71e5c9be03701b8578be77857293b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 14 Nov 2009 08:19:05 -0500 Subject: ext4: plug a buffer_head leak in an error path of ext4_iget() One of the invalid error paths in ext4_iget() forgot to brelse() the inode buffer head. Fix it by adding a brelse() in the common error return path, which also simplifies function. Thanks to Andi Kleen reporting the problem. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2c8caa51addb..554c6798597c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4781,7 +4781,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) struct ext4_iloc iloc; struct ext4_inode *raw_inode; struct ext4_inode_info *ei; - struct buffer_head *bh; struct inode *inode; long ret; int block; @@ -4793,11 +4792,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) return inode; ei = EXT4_I(inode); + iloc.bh = 0; ret = __ext4_get_inode_loc(inode, &iloc, 0); if (ret < 0) goto bad_inode; - bh = iloc.bh; raw_inode = ext4_raw_inode(&iloc); inode->i_mode = le16_to_cpu(raw_inode->i_mode); inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); @@ -4820,7 +4819,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (inode->i_mode == 0 || !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { /* this inode is deleted */ - brelse(bh); ret = -ESTALE; goto bad_inode; } @@ -4852,7 +4850,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > EXT4_INODE_SIZE(inode->i_sb)) { - brelse(bh); ret = -EIO; goto bad_inode; } @@ -4905,10 +4902,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) /* Validate block references which are part of inode */ ret = ext4_check_inode_blockref(inode); } - if (ret) { - brelse(bh); + if (ret) goto bad_inode; - } if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; @@ -4936,7 +4931,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } else { - brelse(bh); ret = -EIO; ext4_error(inode->i_sb, __func__, "bogus i_mode (%o) for inode=%lu", @@ -4949,6 +4943,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) return inode; bad_inode: + brelse(iloc.bh); iget_failed(inode); return ERR_PTR(ret); } -- cgit v1.2.3 From beac2da7565e42be59963824899825d0cc624295 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 23 Nov 2009 07:25:08 -0500 Subject: ext4: add tracepoint for ext4_forget() Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 1 + include/trace/events/ext4.h | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 554c6798597c..13de1dd751f5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -89,6 +89,7 @@ int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, might_sleep(); + trace_ext4_forget(inode, is_metadata, blocknr); BUFFER_TRACE(bh, "enter"); jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index d09550bf3f95..b390e1fc4a7b 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -907,6 +907,32 @@ TRACE_EVENT(ext4_mballoc_free, __entry->result_len, __entry->result_logical) ); +TRACE_EVENT(ext4_forget, + TP_PROTO(struct inode *inode, int is_metadata, __u64 block), + + TP_ARGS(inode, is_metadata, block), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( umode_t, mode ) + __field( int, is_metadata ) + __field( __u64, block ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->mode = inode->i_mode; + __entry->is_metadata = is_metadata; + __entry->block = block; + ), + + TP_printk("dev %s ino %lu mode %d is_metadata %d block %llu", + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->mode, __entry->is_metadata, __entry->block) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 50689696867d95b38d9c7be640a311494a04fb86 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 23 Nov 2009 07:17:34 -0500 Subject: ext4: make sure directory and symlink blocks are revoked When an inode gets unlinked, the functions ext4_clear_blocks() and ext4_remove_blocks() call ext4_forget() for all the buffer heads corresponding to the deleted inode's data blocks. If the inode is a directory or a symlink, the is_metadata parameter must be non-zero so ext4_forget() will revoke them via jbd2_journal_revoke(). Otherwise, if these blocks are reused for a data file, and the system crashes before a journal checkpoint, the journal replay could end up corrupting these data blocks. Thanks to Curt Wohlgemuth for pointing out potential problems in this area. Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/extents.c | 2 +- fs/ext4/inode.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 715264b4bae4..74dcff84c3a8 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2074,7 +2074,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, ext_debug("free last %u blocks starting %llu\n", num, start); for (i = 0; i < num; i++) { bh = sb_find_get_block(inode->i_sb, start + i); - ext4_forget(handle, 0, inode, bh, start + i); + ext4_forget(handle, metadata, inode, bh, start + i); } ext4_free_blocks(handle, inode, start, num, metadata); } else if (from == le32_to_cpu(ex->ee_block) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 13de1dd751f5..c420aaba6e9c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4121,6 +4121,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, __le32 *last) { __le32 *p; + int is_metadata = S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode); + if (try_to_extend_transaction(handle, inode)) { if (bh) { BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); @@ -4151,11 +4153,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, *p = 0; tbh = sb_find_get_block(inode->i_sb, nr); - ext4_forget(handle, 0, inode, tbh, nr); + ext4_forget(handle, is_metadata, inode, tbh, nr); } } - ext4_free_blocks(handle, inode, block_to_free, count, 0); + ext4_free_blocks(handle, inode, block_to_free, count, is_metadata); } /** -- cgit v1.2.3 From 30c6e07a92ea4cb87160d32ffa9bce172576ae4c Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 15 Nov 2009 15:30:58 -0500 Subject: ext4: fix i_flags access in ext4_da_writepages_trans_blocks() We need to be testing the i_flags field in the ext4 specific portion of the inode, instead of the (confusingly aliased) i_flags field in the generic struct inode. Signed-off-by: Julia Lawall Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c420aaba6e9c..9c097489af89 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2789,7 +2789,7 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) * number of contiguous block. So we will limit * number of contiguous block to a sane value */ - if (!(inode->i_flags & EXT4_EXTENTS_FL) && + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) && (max_blocks > EXT4_MAX_TRANS_DATA)) max_blocks = EXT4_MAX_TRANS_DATA; -- cgit v1.2.3 From 1032988c71f3f85483b2b4319684d1205a704c02 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 15 Nov 2009 15:29:56 -0500 Subject: ext4: fix block validity checks so they work correctly with meta_bg The block validity checks used by ext4_data_block_valid() wasn't correctly written to check file systems with the meta_bg feature. Fix this. Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/block_validity.c | 2 +- fs/ext4/inode.c | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 50784ef07563..dc79b75d8f70 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -160,7 +160,7 @@ int ext4_setup_system_zone(struct super_block *sb) if (ext4_bg_has_super(sb, i) && ((i < 5) || ((i % flex_size) == 0))) add_system_zone(sbi, ext4_group_first_block_no(sb, i), - sbi->s_gdb_count + 1); + ext4_bg_num_gdb(sb, i) + 1); gdp = ext4_get_group_desc(sb, i, NULL); ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); if (ret) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9c097489af89..0c0ddc1401e4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4884,10 +4884,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ret = 0; if (ei->i_file_acl && - ((ei->i_file_acl < - (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + - EXT4_SB(sb)->s_gdb_count)) || - (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) { + !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { ext4_error(sb, __func__, "bad extended attribute block %llu in inode #%lu", ei->i_file_acl, inode->i_ino); -- cgit v1.2.3 From 2bba702d4f88d7b010ec37e2527b552588404ae7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 23 Nov 2009 07:24:48 -0500 Subject: ext4: fix error handling in ext4_ind_get_blocks() When an error happened in ext4_splice_branch we failed to notice that in ext4_ind_get_blocks and mapped the buffer anyway. Fix the problem by checking for error properly. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0c0ddc1401e4..3673ec7b1c98 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1022,7 +1022,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, if (!err) err = ext4_splice_branch(handle, inode, iblock, partial, indirect_blks, count); - else + if (err) goto cleanup; set_buffer_new(bh_result); -- cgit v1.2.3 From d6797d14b1640d088652c72508b529a3aea479e3 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 22 Nov 2009 20:52:12 -0500 Subject: ext4: move ext4_forget() to ext4_jbd2.c The ext4_forget() function better belongs in ext4_jbd2.c. This will allow us to do some cleanup of the ext4_journal_revoke() and ext4_journal_forget() functions, as well as giving us better error reporting since we can report the caller of ext4_forget() when things go wrong. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 -- fs/ext4/ext4_jbd2.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/ext4_jbd2.h | 7 +++++++ fs/ext4/inode.c | 53 -------------------------------------------------- 4 files changed, 63 insertions(+), 55 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 05ce38b981cb..57c4e03afa0a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1393,8 +1393,6 @@ extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); extern void ext4_mb_put_buddy_cache_lock(struct super_block *, ext4_group_t, int); /* inode.c */ -int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, - struct buffer_head *bh, ext4_fsblk_t blocknr); struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int, int *); struct buffer_head *ext4_bread(handle_t *, struct inode *, diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 6a9409920dee..913f85715433 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -4,6 +4,8 @@ #include "ext4_jbd2.h" +#include + int __ext4_journal_get_undo_access(const char *where, handle_t *handle, struct buffer_head *bh) { @@ -64,6 +66,60 @@ int __ext4_journal_revoke(const char *where, handle_t *handle, return err; } +/* + * The ext4 forget function must perform a revoke if we are freeing data + * which has been journaled. Metadata (eg. indirect blocks) must be + * revoked in all cases. + * + * "bh" may be NULL: a metadata block may have been freed from memory + * but there may still be a record of it in the journal, and that record + * still needs to be revoked. + * + * If the handle isn't valid we're not journaling, but we still need to + * call into ext4_journal_revoke() to put the buffer head. + */ +int __ext4_forget(const char *where, handle_t *handle, int is_metadata, + struct inode *inode, struct buffer_head *bh, + ext4_fsblk_t blocknr) +{ + int err; + + might_sleep(); + + trace_ext4_forget(inode, is_metadata, blocknr); + BUFFER_TRACE(bh, "enter"); + + jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " + "data mode %x\n", + bh, is_metadata, inode->i_mode, + test_opt(inode->i_sb, DATA_FLAGS)); + + /* Never use the revoke function if we are doing full data + * journaling: there is no need to, and a V1 superblock won't + * support it. Otherwise, only skip the revoke on un-journaled + * data blocks. */ + + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + (!is_metadata && !ext4_should_journal_data(inode))) { + if (bh) { + BUFFER_TRACE(bh, "call jbd2_journal_forget"); + return __ext4_journal_forget(where, handle, bh); + } + return 0; + } + + /* + * data!=journal && (is_metadata || should_journal_data(inode)) + */ + BUFFER_TRACE(bh, "call ext4_journal_revoke"); + err = __ext4_journal_revoke(where, handle, blocknr, bh); + if (err) + ext4_abort(inode->i_sb, __func__, + "error %d when attempting revoke", err); + BUFFER_TRACE(bh, "exit"); + return err; +} + int __ext4_journal_get_create_access(const char *where, handle_t *handle, struct buffer_head *bh) { diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index a2865980342f..dc0b34a903eb 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -139,6 +139,10 @@ int __ext4_journal_forget(const char *where, handle_t *handle, int __ext4_journal_revoke(const char *where, handle_t *handle, ext4_fsblk_t blocknr, struct buffer_head *bh); +int __ext4_forget(const char *where, handle_t *handle, int is_metadata, + struct inode *inode, struct buffer_head *bh, + ext4_fsblk_t blocknr); + int __ext4_journal_get_create_access(const char *where, handle_t *handle, struct buffer_head *bh); @@ -151,6 +155,9 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, __ext4_journal_get_write_access(__func__, (handle), (bh)) #define ext4_journal_revoke(handle, blocknr, bh) \ __ext4_journal_revoke(__func__, (handle), (blocknr), (bh)) +#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ + __ext4_forget(__func__, (handle), (is_metadata), (inode), (bh),\ + (block_nr)) #define ext4_journal_get_create_access(handle, bh) \ __ext4_journal_get_create_access(__func__, (handle), (bh)) #define ext4_journal_forget(handle, bh) \ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3673ec7b1c98..fa37f9504ece 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -70,59 +70,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode) return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); } -/* - * The ext4 forget function must perform a revoke if we are freeing data - * which has been journaled. Metadata (eg. indirect blocks) must be - * revoked in all cases. - * - * "bh" may be NULL: a metadata block may have been freed from memory - * but there may still be a record of it in the journal, and that record - * still needs to be revoked. - * - * If the handle isn't valid we're not journaling, but we still need to - * call into ext4_journal_revoke() to put the buffer head. - */ -int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, - struct buffer_head *bh, ext4_fsblk_t blocknr) -{ - int err; - - might_sleep(); - - trace_ext4_forget(inode, is_metadata, blocknr); - BUFFER_TRACE(bh, "enter"); - - jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " - "data mode %x\n", - bh, is_metadata, inode->i_mode, - test_opt(inode->i_sb, DATA_FLAGS)); - - /* Never use the revoke function if we are doing full data - * journaling: there is no need to, and a V1 superblock won't - * support it. Otherwise, only skip the revoke on un-journaled - * data blocks. */ - - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || - (!is_metadata && !ext4_should_journal_data(inode))) { - if (bh) { - BUFFER_TRACE(bh, "call jbd2_journal_forget"); - return ext4_journal_forget(handle, bh); - } - return 0; - } - - /* - * data!=journal && (is_metadata || should_journal_data(inode)) - */ - BUFFER_TRACE(bh, "call ext4_journal_revoke"); - err = ext4_journal_revoke(handle, blocknr, bh); - if (err) - ext4_abort(inode->i_sb, __func__, - "error %d when attempting revoke", err); - BUFFER_TRACE(bh, "exit"); - return err; -} - /* * Work out how many blocks we need to proceed with the next chunk of a * truncate transaction. -- cgit v1.2.3 From b7e57e7c2a41826e51fe060fae5158bfc7a04e81 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 22 Nov 2009 21:00:13 -0500 Subject: ext4: fold ext4_journal_forget() into ext4_forget() Convert the last two callers of ext4_journal_forget() to use ext4_forget() instead, and then fold ext4_journal_forget() into ext4_forget(). This reduces are code complexity and shortens our call stack. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_jbd2.c | 22 +++++----------------- fs/ext4/ext4_jbd2.h | 6 ------ fs/ext4/inode.c | 16 ++++++++++++++-- 3 files changed, 19 insertions(+), 25 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 92c88a8f734d..b57e5c711b6d 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -34,22 +34,6 @@ int __ext4_journal_get_write_access(const char *where, handle_t *handle, return err; } -int __ext4_journal_forget(const char *where, handle_t *handle, - struct buffer_head *bh) -{ - int err = 0; - - if (ext4_handle_valid(handle)) { - err = jbd2_journal_forget(handle, bh); - if (err) - ext4_journal_abort_handle(where, __func__, bh, - handle, err); - } - else - bforget(bh); - return err; -} - /* * The ext4 forget function must perform a revoke if we are freeing data * which has been journaled. Metadata (eg. indirect blocks) must be @@ -93,7 +77,11 @@ int __ext4_forget(const char *where, handle_t *handle, int is_metadata, (!is_metadata && !ext4_should_journal_data(inode))) { if (bh) { BUFFER_TRACE(bh, "call jbd2_journal_forget"); - return __ext4_journal_forget(where, handle, bh); + err = jbd2_journal_forget(handle, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, + handle, err); + return err; } return 0; } diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index f9fb4bb69577..84bc98ab9f0d 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -127,10 +127,6 @@ int __ext4_journal_get_undo_access(const char *where, handle_t *handle, int __ext4_journal_get_write_access(const char *where, handle_t *handle, struct buffer_head *bh); -/* When called with an invalid handle, this will still do a put on the BH */ -int __ext4_journal_forget(const char *where, handle_t *handle, - struct buffer_head *bh); - int __ext4_forget(const char *where, handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr); @@ -150,8 +146,6 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, (block_nr)) #define ext4_journal_get_create_access(handle, bh) \ __ext4_journal_get_create_access(__func__, (handle), (bh)) -#define ext4_journal_forget(handle, bh) \ - __ext4_journal_forget(__func__, (handle), (bh)) #define ext4_handle_dirty_metadata(handle, inode, bh) \ __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh)) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fa37f9504ece..72c694323492 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -767,7 +767,13 @@ failed: /* Allocation failed, free what we already allocated */ for (i = 1; i <= n ; i++) { BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); - ext4_journal_forget(handle, branch[i].bh); + /* + * Note: is_metadata is 0 because branch[i].bh is + * newly allocated, so there is no need to revoke the + * block. If we do, it's harmless, but not necessary. + */ + ext4_forget(handle, 0, inode, branch[i].bh, + branch[i].bh->b_blocknr); } for (i = 0; i < indirect_blks; i++) ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); @@ -852,7 +858,13 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, err_out: for (i = 1; i <= num; i++) { BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); - ext4_journal_forget(handle, where[i].bh); + /* + * Note: is_metadata is 0 because branch[i].bh is + * newly allocated, so there is no need to revoke the + * block. If we do, it's harmless, but not necessary. + */ + ext4_forget(handle, 0, inode, where[i].bh, + where[i].bh->b_blocknr); ext4_free_blocks(handle, inode, le32_to_cpu(where[i-1].key), 1, 0); } -- cgit v1.2.3 From e6362609b6c71c5b802026be9cf263bbdd67a50e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 23 Nov 2009 07:17:05 -0500 Subject: ext4: call ext4_forget() from ext4_free_blocks() Add the facility for ext4_forget() to be called from ext4_free_blocks(). This simplifies the code in a large number of places, and centralizes most of the work of calling ext4_forget() into a single place. Also fix a bug in the extents migration code; it wasn't calling ext4_forget() when releasing the indirect blocks during the conversion. As a result, if the system cashed during or shortly after the extents migration, and the released indirect blocks get reused as data blocks, the journal replay would corrupt the data blocks. With this new patch, fixing this bug was as simple as adding the EXT4_FREE_BLOCKS_FORGET flags to the call to ext4_free_blocks(). Signed-off-by: "Theodore Ts'o" Cc: "Aneesh Kumar K.V" --- fs/ext4/ext4.h | 10 +++++-- fs/ext4/extents.c | 24 ++++++---------- fs/ext4/inode.c | 67 ++++++++++++++++++--------------------------- fs/ext4/mballoc.c | 49 ++++++++++++++++++++++++--------- fs/ext4/migrate.c | 23 +++++++++++----- fs/ext4/xattr.c | 8 ++++-- include/trace/events/ext4.h | 16 ++++++----- 7 files changed, 109 insertions(+), 88 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 210e1b53e91f..4cfc2f0edb3f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -375,6 +375,12 @@ struct ext4_new_group_data { #define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ EXT4_GET_BLOCKS_DIO_CREATE_EXT) +/* + * Flags used by ext4_free_blocks + */ +#define EXT4_FREE_BLOCKS_METADATA 0x0001 +#define EXT4_FREE_BLOCKS_FORGET 0x0002 + /* * ioctl commands */ @@ -1384,8 +1390,8 @@ extern void ext4_discard_preallocations(struct inode *); extern int __init init_ext4_mballoc(void); extern void exit_ext4_mballoc(void); extern void ext4_free_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t block, unsigned long count, - int metadata); + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 74dcff84c3a8..2c4a9321fb14 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1007,7 +1007,8 @@ cleanup: for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; - ext4_free_blocks(handle, inode, ablocks[i], 1, 1); + ext4_free_blocks(handle, inode, 0, ablocks[i], 1, + EXT4_FREE_BLOCKS_METADATA); } } kfree(ablocks); @@ -1957,7 +1958,6 @@ errout: static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { - struct buffer_head *bh; int err; ext4_fsblk_t leaf; @@ -1973,9 +1973,8 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, if (err) return err; ext_debug("index is empty, remove it, free block %llu\n", leaf); - bh = sb_find_get_block(inode->i_sb, leaf); - ext4_forget(handle, 1, inode, bh, leaf); - ext4_free_blocks(handle, inode, leaf, 1, 1); + ext4_free_blocks(handle, inode, 0, leaf, 1, + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return err; } @@ -2042,12 +2041,11 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, struct ext4_extent *ex, ext4_lblk_t from, ext4_lblk_t to) { - struct buffer_head *bh; unsigned short ee_len = ext4_ext_get_actual_len(ex); - int i, metadata = 0; + int flags = EXT4_FREE_BLOCKS_FORGET; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - metadata = 1; + flags |= EXT4_FREE_BLOCKS_METADATA; #ifdef EXTENTS_STATS { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -2072,11 +2070,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, num = le32_to_cpu(ex->ee_block) + ee_len - from; start = ext_pblock(ex) + ee_len - num; ext_debug("free last %u blocks starting %llu\n", num, start); - for (i = 0; i < num; i++) { - bh = sb_find_get_block(inode->i_sb, start + i); - ext4_forget(handle, metadata, inode, bh, start + i); - } - ext4_free_blocks(handle, inode, start, num, metadata); + ext4_free_blocks(handle, inode, 0, start, num, flags); } else if (from == le32_to_cpu(ex->ee_block) && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", @@ -3319,8 +3313,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* not a good idea to call discard here directly, * but otherwise we'd need to call it every free() */ ext4_discard_preallocations(inode); - ext4_free_blocks(handle, inode, ext_pblock(&newex), - ext4_ext_get_actual_len(&newex), 0); + ext4_free_blocks(handle, inode, 0, ext_pblock(&newex), + ext4_ext_get_actual_len(&newex), 0); goto out2; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 72c694323492..3b28e1fbfc90 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -669,7 +669,7 @@ allocated: return ret; failed_out: for (i = 0; i < index; i++) - ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); return ret; } @@ -765,20 +765,20 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, return err; failed: /* Allocation failed, free what we already allocated */ + ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); for (i = 1; i <= n ; i++) { - BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); /* - * Note: is_metadata is 0 because branch[i].bh is - * newly allocated, so there is no need to revoke the - * block. If we do, it's harmless, but not necessary. + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. */ - ext4_forget(handle, 0, inode, branch[i].bh, - branch[i].bh->b_blocknr); + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, + EXT4_FREE_BLOCKS_FORGET); } - for (i = 0; i < indirect_blks; i++) - ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); + for (i = n+1; i < indirect_blks; i++) + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); - ext4_free_blocks(handle, inode, new_blocks[i], num, 0); + ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0); return err; } @@ -857,18 +857,16 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, err_out: for (i = 1; i <= num; i++) { - BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); /* - * Note: is_metadata is 0 because branch[i].bh is - * newly allocated, so there is no need to revoke the - * block. If we do, it's harmless, but not necessary. + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. */ - ext4_forget(handle, 0, inode, where[i].bh, - where[i].bh->b_blocknr); - ext4_free_blocks(handle, inode, - le32_to_cpu(where[i-1].key), 1, 0); + ext4_free_blocks(handle, inode, where[i].bh, 0, 1, + EXT4_FREE_BLOCKS_FORGET); } - ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0); + ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key), + blks, 0); return err; } @@ -4080,7 +4078,10 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, __le32 *last) { __le32 *p; - int is_metadata = S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode); + int flags = EXT4_FREE_BLOCKS_FORGET; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_METADATA; if (try_to_extend_transaction(handle, inode)) { if (bh) { @@ -4096,27 +4097,10 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, } } - /* - * Any buffers which are on the journal will be in memory. We - * find them on the hash table so jbd2_journal_revoke() will - * run jbd2_journal_forget() on them. We've already detached - * each block from the file, so bforget() in - * jbd2_journal_forget() should be safe. - * - * AKPM: turn on bforget in jbd2_journal_forget()!!! - */ - for (p = first; p < last; p++) { - u32 nr = le32_to_cpu(*p); - if (nr) { - struct buffer_head *tbh; - - *p = 0; - tbh = sb_find_get_block(inode->i_sb, nr); - ext4_forget(handle, is_metadata, inode, tbh, nr); - } - } + for (p = first; p < last; p++) + *p = 0; - ext4_free_blocks(handle, inode, block_to_free, count, is_metadata); + ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); } /** @@ -4304,7 +4288,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, blocks_for_truncate(inode)); } - ext4_free_blocks(handle, inode, nr, 1, 1); + ext4_free_blocks(handle, inode, 0, nr, 1, + EXT4_FREE_BLOCKS_METADATA); if (parent_bh) { /* diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 0dca90be1afb..78de5d3c5dce 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4436,8 +4436,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, * @metadata: Are these metadata blocks */ void ext4_free_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t block, unsigned long count, - int metadata) + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags) { struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode->i_sb; @@ -4454,15 +4454,12 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, int err = 0; int ret; - /* - * We need to make sure we don't reuse the freed block until - * after the transaction is committed, which we can do by - * treating the block as metadata, below. We make an - * exception if the inode is to be written in writeback mode - * since writeback mode has weak data consistency guarantees. - */ - if (!ext4_should_writeback_data(inode)) - metadata = 1; + if (bh) { + if (block) + BUG_ON(block != bh->b_blocknr); + else + block = bh->b_blocknr; + } sbi = EXT4_SB(sb); es = EXT4_SB(sb)->s_es; @@ -4476,7 +4473,32 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, } ext4_debug("freeing block %llu\n", block); - trace_ext4_free_blocks(inode, block, count, metadata); + trace_ext4_free_blocks(inode, block, count, flags); + + if (flags & EXT4_FREE_BLOCKS_FORGET) { + struct buffer_head *tbh = bh; + int i; + + BUG_ON(bh && (count > 1)); + + for (i = 0; i < count; i++) { + if (!bh) + tbh = sb_find_get_block(inode->i_sb, + block + i); + ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, + inode, tbh, block + i); + } + } + + /* + * We need to make sure we don't reuse the freed block until + * after the transaction is committed, which we can do by + * treating the block as metadata, below. We make an + * exception if the inode is to be written in writeback mode + * since writeback mode has weak data consistency guarantees. + */ + if (!ext4_should_writeback_data(inode)) + flags |= EXT4_FREE_BLOCKS_METADATA; ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); if (ac) { @@ -4552,7 +4574,8 @@ do_more: err = ext4_mb_load_buddy(sb, block_group, &e4b); if (err) goto error_return; - if (metadata && ext4_handle_valid(handle)) { + + if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) { struct ext4_free_data *new_entry; /* * blocks being freed are metadata. these blocks shouldn't diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index a93d5b80f3e2..d641e13e740e 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -262,13 +262,17 @@ static int free_dind_blocks(handle_t *handle, for (i = 0; i < max_entries; i++) { if (tmp_idata[i]) { extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, - le32_to_cpu(tmp_idata[i]), 1, 1); + ext4_free_blocks(handle, inode, 0, + le32_to_cpu(tmp_idata[i]), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); } } put_bh(bh); extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); + ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); return 0; } @@ -297,7 +301,9 @@ static int free_tind_blocks(handle_t *handle, } put_bh(bh); extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); + ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); return 0; } @@ -308,8 +314,10 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) /* ei->i_data[EXT4_IND_BLOCK] */ if (i_data[0]) { extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, - le32_to_cpu(i_data[0]), 1, 1); + ext4_free_blocks(handle, inode, 0, + le32_to_cpu(i_data[0]), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); } /* ei->i_data[EXT4_DIND_BLOCK] */ @@ -419,7 +427,8 @@ static int free_ext_idx(handle_t *handle, struct inode *inode, } put_bh(bh); extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, block, 1, 1); + ext4_free_blocks(handle, inode, 0, block, 1, + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return retval; } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 025701926f9a..910bf9a59cb3 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -482,9 +482,10 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, ea_bdebug(bh, "refcount now=0; freeing"); if (ce) mb_cache_entry_free(ce); - ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1); get_bh(bh); - ext4_forget(handle, 1, inode, bh, bh->b_blocknr); + ext4_free_blocks(handle, inode, bh, 0, 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); error = ext4_handle_dirty_metadata(handle, inode, bh); @@ -832,7 +833,8 @@ inserted: new_bh = sb_getblk(sb, block); if (!new_bh) { getblk_failed: - ext4_free_blocks(handle, inode, block, 1, 1); + ext4_free_blocks(handle, inode, 0, block, 1, + EXT4_FREE_BLOCKS_METADATA); error = -EIO; goto cleanup; } diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index b390e1fc4a7b..74f628bfdd1b 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -650,30 +650,32 @@ TRACE_EVENT(ext4_allocate_blocks, TRACE_EVENT(ext4_free_blocks, TP_PROTO(struct inode *inode, __u64 block, unsigned long count, - int metadata), + int flags), - TP_ARGS(inode, block, count, metadata), + TP_ARGS(inode, block, count, flags), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) + __field( umode_t, mode ) __field( __u64, block ) __field( unsigned long, count ) - __field( int, metadata ) - + __field( int, flags ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; + __entry->mode = inode->i_mode; __entry->block = block; __entry->count = count; - __entry->metadata = metadata; + __entry->flags = flags; ), - TP_printk("dev %s ino %lu block %llu count %lu metadata %d", + TP_printk("dev %s ino %lu mode 0%o block %llu count %lu flags %d", jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, - __entry->block, __entry->count, __entry->metadata) + __entry->mode, __entry->block, __entry->count, + __entry->flags) ); TRACE_EVENT(ext4_sync_file, -- cgit v1.2.3 From 3f0ca309858ee186435c608ee9eaafd1c8dcb53a Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 24 Nov 2009 11:15:44 -0500 Subject: ext4: remove unused parameter wbc from __ext4_journalled_writepage() CC: Jan Kara Signed-off-by: Wu Fengguang Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3b28e1fbfc90..d3f99e9e8a31 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2558,7 +2558,6 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) } static int __ext4_journalled_writepage(struct page *page, - struct writeback_control *wbc, unsigned int len) { struct address_space *mapping = page->mapping; @@ -2716,7 +2715,7 @@ static int ext4_writepage(struct page *page, * doesn't seem much point in redirtying the page here. */ ClearPageChecked(page); - return __ext4_journalled_writepage(page, wbc, len); + return __ext4_journalled_writepage(page, len); } if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) -- cgit v1.2.3 From b9a4207d5e911b938f73079a83cc2ae10524ec7f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 8 Dec 2009 21:24:33 -0500 Subject: ext4: Avoid data / filesystem corruption when write fails to copy data When ext4_write_begin fails after allocating some blocks or generic_perform_write fails to copy data to write, we truncate blocks already instantiated beyond i_size. Although these blocks were never inside i_size, we have to truncate the pagecache of these blocks so that corresponding buffers get unmapped. Otherwise subsequent __block_prepare_write (called because we are retrying the write) will find the buffers mapped, not call ->get_block, and thus the page will be backed by already freed blocks leading to filesystem and data corruption. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d3f99e9e8a31..0e2ea572856c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1492,6 +1492,16 @@ static int do_journal_get_write_access(handle_t *handle, return ext4_journal_get_write_access(handle, bh); } +/* + * Truncate blocks that were not used by write. We have to truncate the + * pagecache as well so that corresponding buffers get properly unmapped. + */ +static void ext4_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext4_truncate(inode); +} + static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1557,7 +1567,7 @@ retry: ext4_journal_stop(handle); if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might * still be on the orphan list; we need to @@ -1667,7 +1677,7 @@ static int ext4_ordered_write_end(struct file *file, ret = ret2; if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode @@ -1709,7 +1719,7 @@ static int ext4_writeback_write_end(struct file *file, ret = ret2; if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode @@ -1772,7 +1782,7 @@ static int ext4_journalled_write_end(struct file *file, if (!ret) ret = ret2; if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode @@ -3048,7 +3058,7 @@ retry: * i_size_read because we hold i_mutex. */ if (pos + len > inode->i_size) - ext4_truncate(inode); + ext4_truncate_failed_write(inode); } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) -- cgit v1.2.3 From 8aa6790f876e81f5a2211fe1711a5fe3fe2d7b20 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Tue, 8 Dec 2009 22:41:52 -0500 Subject: ext4: ext4_get_reserved_space() must return bytes instead of blocks Signed-off-by: Dmitry Monakhov Reviewed-by: Eric Sandeen Acked-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0e2ea572856c..2da74f57a10b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1010,7 +1010,7 @@ qsize_t ext4_get_reserved_space(struct inode *inode) EXT4_I(inode)->i_reserved_meta_blocks; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - return total; + return (total << inode->i_blkbits); } /* * Calculate the number of metadata blocks need to reserve -- cgit v1.2.3 From 5aca07eb7d8f14d90c740834d15ca15277f4820c Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Tue, 8 Dec 2009 22:42:15 -0500 Subject: ext4: quota macros cleanup Currently all quota block reservation macros contains hard-coded "2" aka MAXQUOTAS value. This is no good because in some places it is not obvious to understand what does this digit represent. Let's introduce new macro with self descriptive name. Signed-off-by: Dmitry Monakhov Acked-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_jbd2.h | 8 ++++++-- fs/ext4/extents.c | 2 +- fs/ext4/inode.c | 2 +- fs/ext4/migrate.c | 4 ++-- fs/ext4/namei.c | 8 ++++---- 5 files changed, 14 insertions(+), 10 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 84bc98ab9f0d..2c2b262bd31b 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -49,7 +49,7 @@ #define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ EXT4_XATTR_TRANS_BLOCKS - 2 + \ - 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) /* * Define the number of metadata blocks we need to account to modify data. @@ -57,7 +57,7 @@ * This include super block, inode block, quota blocks and xattr blocks */ #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ - 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) /* Delete operations potentially hit one directory's namespace plus an * entire inode, plus arbitrary amounts of bitmap/indirection data. Be @@ -92,6 +92,7 @@ * but inode, sb and group updates are done only once */ #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0) + #define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0) #else @@ -99,6 +100,9 @@ #define EXT4_QUOTA_INIT_BLOCKS(sb) 0 #define EXT4_QUOTA_DEL_BLOCKS(sb) 0 #endif +#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) int ext4_mark_iloc_dirty(handle_t *handle, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2c4a9321fb14..5967f18fd7e7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2161,7 +2161,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, correct_index = 1; credits += (ext_depth(inode)) + 1; } - credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); + credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); err = ext4_ext_truncate_extend_restart(handle, inode, credits); if (err) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2da74f57a10b..1b1b7d918114 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5175,7 +5175,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ - handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+ + handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); if (IS_ERR(handle)) { error = PTR_ERR(handle); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index d641e13e740e..81415814b00b 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -238,7 +238,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode) * So allocate a credit of 3. We may update * quota (user and group). */ - needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); + needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); if (ext4_journal_extend(handle, needed) != 0) retval = ext4_journal_restart(handle, needed); @@ -486,7 +486,7 @@ int ext4_ext_migrate(struct inode *inode) handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb) + EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + 1); if (IS_ERR(handle)) { retval = PTR_ERR(handle); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index fde08c919d12..17a17e10dd60 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1769,7 +1769,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode, retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1803,7 +1803,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1840,7 +1840,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2253,7 +2253,7 @@ static int ext4_symlink(struct inode *dir, retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); -- cgit v1.2.3 From 194074acacebc169ded90a4657193f5180015051 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Tue, 8 Dec 2009 22:42:28 -0500 Subject: ext4: fix incorrect block reservation on quota transfer. Inside ->setattr() call both ATTR_UID and ATTR_GID may be valid This means that we may end-up with transferring all quotas. Add we have to reserve QUOTA_DEL_BLOCKS for all quotas, as we do in case of QUOTA_INIT_BLOCKS. Signed-off-by: Dmitry Monakhov Reviewed-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1b1b7d918114..958c3ff800e9 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5176,7 +5176,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ - EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; -- cgit v1.2.3 From b436b9bef84de6893e86346d8fbf7104bc520645 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 8 Dec 2009 23:51:10 -0500 Subject: ext4: Wait for proper transaction commit on fsync We cannot rely on buffer dirty bits during fsync because pdflush can come before fsync is called and clear dirty bits without forcing a transaction commit. What we do is that we track which transaction has last changed the inode and which transaction last changed allocation and force it to disk on fsync. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 7 +++++++ fs/ext4/ext4_jbd2.h | 13 +++++++++++++ fs/ext4/extents.c | 14 ++++++++++++-- fs/ext4/fsync.c | 46 +++++++++++++++++----------------------------- fs/ext4/inode.c | 29 +++++++++++++++++++++++++++++ fs/ext4/super.c | 2 ++ 6 files changed, 80 insertions(+), 31 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4cfc2f0edb3f..ab31e65d46d0 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -709,6 +709,13 @@ struct ext4_inode_info { struct list_head i_aio_dio_complete_list; /* current io_end structure for async DIO write*/ ext4_io_end_t *cur_aio_dio; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + tid_t i_sync_tid; + tid_t i_datasync_tid; }; /* diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 2c2b262bd31b..05eca817d704 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -249,6 +249,19 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) return 0; } +static inline void ext4_update_inode_fsync_trans(handle_t *handle, + struct inode *inode, + int datasync) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + if (ext4_handle_valid(handle)) { + ei->i_sync_tid = handle->h_transaction->t_tid; + if (datasync) + ei->i_datasync_tid = handle->h_transaction->t_tid; + } +} + /* super.c */ int ext4_force_commit(struct super_block *sb); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 5967f18fd7e7..700206e525da 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3058,6 +3058,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { ret = ext4_convert_unwritten_extents_dio(handle, inode, path); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); goto out2; } /* buffered IO case */ @@ -3085,6 +3087,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, ret = ext4_ext_convert_to_initialized(handle, inode, path, iblock, max_blocks); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); out: if (ret <= 0) { err = ret; @@ -3323,10 +3327,16 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, allocated = ext4_ext_get_actual_len(&newex); set_buffer_new(bh_result); - /* Cache only when it is _not_ an uninitialized extent */ - if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) + /* + * Cache the extent and update transaction to commit on fdatasync only + * when it is _not_ an uninitialized extent. + */ + if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { ext4_ext_put_in_cache(inode, iblock, allocated, newblock, EXT4_EXT_CACHE_EXTENT); + ext4_update_inode_fsync_trans(handle, inode, 1); + } else + ext4_update_inode_fsync_trans(handle, inode, 0); out: if (allocated > max_blocks) allocated = max_blocks; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index a3c25076aef1..0b22497d92e1 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -51,25 +51,30 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) { struct inode *inode = dentry->d_inode; + struct ext4_inode_info *ei = EXT4_I(inode); journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - int err, ret = 0; + int ret; + tid_t commit_tid; J_ASSERT(ext4_journal_current_handle() == NULL); trace_ext4_sync_file(file, dentry, datasync); + if (inode->i_sb->s_flags & MS_RDONLY) + return 0; + ret = flush_aio_dio_completed_IO(inode); if (ret < 0) return ret; + + if (!journal) + return simple_fsync(file, dentry, datasync); + /* - * data=writeback: + * data=writeback,ordered: * The caller's filemap_fdatawrite()/wait will sync the data. - * sync_inode() will sync the metadata - * - * data=ordered: - * The caller's filemap_fdatawrite() will write the data and - * sync_inode() will write the inode if it is dirty. Then the caller's - * filemap_fdatawait() will wait on the pages. + * Metadata is in the journal, we wait for proper transaction to + * commit here. * * data=journal: * filemap_fdatawrite won't do anything (the buffers are clean). @@ -82,27 +87,10 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) if (ext4_should_journal_data(inode)) return ext4_force_commit(inode->i_sb); - if (!journal) - ret = sync_mapping_buffers(inode->i_mapping); - - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - goto out; - - /* - * The VFS has written the file data. If the inode is unaltered - * then we need not start a commit. - */ - if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0, /* sys_fsync did this */ - }; - err = sync_inode(inode, &wbc); - if (ret == 0) - ret = err; - } -out: - if (journal && (journal->j_flags & JBD2_BARRIER)) + commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; + if (jbd2_log_start_commit(journal, commit_tid)) + jbd2_log_wait_commit(journal, commit_tid); + else if (journal->j_flags & JBD2_BARRIER) blkdev_issue_flush(inode->i_sb->s_bdev, NULL); return ret; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 958c3ff800e9..f1bc1e338828 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -983,6 +983,8 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, goto cleanup; set_buffer_new(bh_result); + + ext4_update_inode_fsync_trans(handle, inode, 1); got_it: map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); if (count > blocks_to_boundary) @@ -4738,6 +4740,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) struct ext4_inode *raw_inode; struct ext4_inode_info *ei; struct inode *inode; + journal_t *journal = EXT4_SB(sb)->s_journal; long ret; int block; @@ -4802,6 +4805,31 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); + /* + * Set transaction id's of transactions that have to be committed + * to finish f[data]sync. We set them to currently running transaction + * as we cannot be sure that the inode or some of its metadata isn't + * part of the transaction - the inode could have been reclaimed and + * now it is reread from disk. + */ + if (journal) { + transaction_t *transaction; + tid_t tid; + + spin_lock(&journal->j_state_lock); + if (journal->j_running_transaction) + transaction = journal->j_running_transaction; + else + transaction = journal->j_committing_transaction; + if (transaction) + tid = transaction->t_tid; + else + tid = journal->j_commit_sequence; + spin_unlock(&journal->j_state_lock); + ei->i_sync_tid = tid; + ei->i_datasync_tid = tid; + } + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > @@ -5056,6 +5084,7 @@ static int ext4_do_update_inode(handle_t *handle, err = rc; ei->i_state &= ~EXT4_STATE_NEW; + ext4_update_inode_fsync_trans(handle, inode, 0); out_brelse: brelse(bh); ext4_std_error(inode->i_sb, err); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8ab0c9518473..2b13dcfcf775 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -706,6 +706,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) spin_lock_init(&(ei->i_block_reservation_lock)); INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); ei->cur_aio_dio = NULL; + ei->i_sync_tid = 0; + ei->i_datasync_tid = 0; return &ei->vfs_inode; } -- cgit v1.2.3