Diffstat (limited to 'fs/ext4')
-rw-r--r--  fs/ext4/ext4.h            |   16
-rw-r--r--  fs/ext4/extents.c         |  111
-rw-r--r--  fs/ext4/extents_status.c  |    3
-rw-r--r--  fs/ext4/fast_commit.c     |  210
-rw-r--r--  fs/ext4/fast_commit.h     |    3
-rw-r--r--  fs/ext4/file.c            |   43
-rw-r--r--  fs/ext4/ialloc.c          |    9
-rw-r--r--  fs/ext4/inode.c           |   54
-rw-r--r--  fs/ext4/ioctl.c           |   11
-rw-r--r--  fs/ext4/mballoc.c         |  317
-rw-r--r--  fs/ext4/mballoc.h         |    1
-rw-r--r--  fs/ext4/migrate.c         |    3
-rw-r--r--  fs/ext4/mmp.c             |    2
-rw-r--r--  fs/ext4/move_extent.c     |   26
-rw-r--r--  fs/ext4/namei.c           |   23
-rw-r--r--  fs/ext4/readpage.c        |   10
-rw-r--r--  fs/ext4/resize.c          |    2
-rw-r--r--  fs/ext4/super.c           | 1240
-rw-r--r--  fs/ext4/verity.c          |    9
-rw-r--r--  fs/ext4/xattr.c           |    1
20 files changed, 1140 insertions(+), 954 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9bca5565547b..8d5453852f98 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -167,8 +167,6 @@ enum SHIFT_DIRECTION { #define EXT4_MB_CR0_OPTIMIZED 0x8000 /* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ #define EXT4_MB_CR1_OPTIMIZED 0x00010000 -/* Perform linear traversal for one group */ -#define EXT4_MB_SEARCH_NEXT_LINEAR 0x00020000 struct ext4_allocation_request { /* target inode for block we're allocating */ struct inode *inode; @@ -1600,8 +1598,8 @@ struct ext4_sb_info { struct list_head s_discard_list; struct work_struct s_discard_work; atomic_t s_retry_alloc_pending; - struct rb_root s_mb_avg_fragment_size_root; - rwlock_t s_mb_rb_lock; + struct list_head *s_mb_avg_fragment_size; + rwlock_t *s_mb_avg_fragment_size_locks; struct list_head *s_mb_largest_free_orders; rwlock_t *s_mb_largest_free_orders_locks; @@ -2979,6 +2977,7 @@ extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, extern int ext4_write_inode(struct inode *, struct writeback_control *); extern int ext4_setattr(struct user_namespace *, struct dentry *, struct iattr *); +extern u32 ext4_dio_alignment(struct inode *inode); extern int ext4_getattr(struct user_namespace *, const struct path *, struct kstat *, u32, unsigned int); extern void ext4_evict_inode(struct inode *); @@ -3413,6 +3412,8 @@ struct ext4_group_info { ext4_grpblk_t bb_first_free; /* first free block */ ext4_grpblk_t bb_free; /* total free blocks */ ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ + int bb_avg_fragment_size_order; /* order of average + fragment in BG */ ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ ext4_group_t bb_group; /* Group number */ struct list_head bb_prealloc_list; @@ -3420,7 +3421,7 @@ struct ext4_group_info { void *bb_bitmap; #endif struct rw_semaphore alloc_sem; - struct rb_node bb_avg_fragment_size_rb; + struct list_head bb_avg_fragment_size_node; struct list_head bb_largest_free_order_node; ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block * regions, index is order. 
@@ -3591,9 +3592,6 @@ extern bool empty_inline_dir(struct inode *dir, int *has_inline_data); extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, struct ext4_dir_entry_2 **parent_de, int *retval); -extern int ext4_inline_data_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, - int *has_inline, __u64 start, __u64 len); extern void *ext4_read_inline_link(struct inode *inode); struct iomap; @@ -3712,7 +3710,7 @@ extern int ext4_ext_insert_extent(handle_t *, struct inode *, extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, struct ext4_ext_path **, int flags); -extern void ext4_ext_drop_refs(struct ext4_ext_path *); +extern void ext4_free_ext_path(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c148bb97b527..f1956288307f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -106,6 +106,25 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) return 0; } +static void ext4_ext_drop_refs(struct ext4_ext_path *path) +{ + int depth, i; + + if (!path) + return; + depth = path->p_depth; + for (i = 0; i <= depth; i++, path++) { + brelse(path->p_bh); + path->p_bh = NULL; + } +} + +void ext4_free_ext_path(struct ext4_ext_path *path) +{ + ext4_ext_drop_refs(path); + kfree(path); +} + /* * Make sure 'handle' has at least 'check_cred' credits. If not, restart * transaction with 'restart_cred' credits. The function drops i_data_sem @@ -460,6 +479,10 @@ static int __ext4_ext_check(const char *function, unsigned int line, error_msg = "invalid eh_entries"; goto corrupted; } + if (unlikely((eh->eh_entries == 0) && (depth > 0))) { + error_msg = "eh_entries is 0 but eh_depth is > 0"; + goto corrupted; + } if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) { error_msg = "invalid extent entries"; goto corrupted; @@ -632,8 +655,7 @@ int ext4_ext_precache(struct inode *inode) ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED); out: up_read(&ei->i_data_sem); - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); return ret; } @@ -720,19 +742,6 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, #define ext4_ext_show_move(inode, path, newblock, level) #endif -void ext4_ext_drop_refs(struct ext4_ext_path *path) -{ - int depth, i; - - if (!path) - return; - depth = path->p_depth; - for (i = 0; i <= depth; i++, path++) { - brelse(path->p_bh); - path->p_bh = NULL; - } -} - /* * ext4_ext_binsearch_idx: * binary search for the closest index of the given block @@ -951,8 +960,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, return path; err: - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); if (orig_path) *orig_path = NULL; return ERR_PTR(ret); @@ -2170,8 +2178,7 @@ merge: err = ext4_ext_dirty(handle, inode, path + path->p_depth); cleanup: - ext4_ext_drop_refs(npath); - kfree(npath); + ext4_free_ext_path(npath); return err; } @@ -3057,8 +3064,7 @@ again: } } out: - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); path = NULL; if (err == -EAGAIN) goto again; @@ -4371,8 +4377,7 @@ got_allocated_blocks: allocated = map->m_len; ext4_ext_show_leaf(inode, path); out: - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? 
err : allocated); @@ -5241,8 +5246,7 @@ again: break; } out: - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); return ret; } @@ -5534,15 +5538,13 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) EXT4_GET_BLOCKS_METADATA_NOFAIL); } - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); if (ret < 0) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } } else { - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); } ret = ext4_es_remove_extent(inode, offset_lblk, @@ -5762,10 +5764,8 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, count -= len; repeat: - ext4_ext_drop_refs(path1); - kfree(path1); - ext4_ext_drop_refs(path2); - kfree(path2); + ext4_free_ext_path(path1); + ext4_free_ext_path(path2); path1 = path2 = NULL; } return replaced_count; @@ -5844,8 +5844,7 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) } out: - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); return err ? err : mapped; } @@ -5912,8 +5911,7 @@ int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]); up_write(&EXT4_I(inode)->i_data_sem); out: - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); ext4_mark_inode_dirty(NULL, inode); return ret; } @@ -5931,8 +5929,7 @@ void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end) return; ex = path[path->p_depth].p_ext; if (!ex) { - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); ext4_mark_inode_dirty(NULL, inode); return; } @@ -5945,8 +5942,7 @@ void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end) ext4_ext_dirty(NULL, inode, &path[path->p_depth]); up_write(&EXT4_I(inode)->i_data_sem); ext4_mark_inode_dirty(NULL, inode); - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); } } @@ -5985,13 +5981,11 @@ int ext4_ext_replay_set_iblocks(struct inode *inode) return PTR_ERR(path); ex = path[path->p_depth].p_ext; if (!ex) { - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); goto out; } end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); /* Count the number of data blocks */ cur = 0; @@ -6021,30 +6015,26 @@ int ext4_ext_replay_set_iblocks(struct inode *inode) if (IS_ERR(path)) goto out; numblks += path->p_depth; - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); while (cur < end) { path = ext4_find_extent(inode, cur, NULL, 0); if (IS_ERR(path)) break; ex = path[path->p_depth].p_ext; if (!ex) { - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); return 0; } cur = max(cur + 1, le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)); ret = skip_hole(inode, &cur); if (ret < 0) { - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); break; } path2 = ext4_find_extent(inode, cur, NULL, 0); if (IS_ERR(path2)) { - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); break; } for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) { @@ -6058,10 +6048,8 @@ int ext4_ext_replay_set_iblocks(struct inode *inode) if (cmp1 != cmp2 && cmp2 != 0) numblks++; } - ext4_ext_drop_refs(path); - ext4_ext_drop_refs(path2); - kfree(path); - kfree(path2); + ext4_free_ext_path(path); + ext4_free_ext_path(path2); } out: @@ -6088,13 +6076,11 @@ int ext4_ext_clear_bb(struct inode *inode) return PTR_ERR(path); ex = path[path->p_depth].p_ext; if (!ex) { - 
ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); return 0; } end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); cur = 0; while (cur < end) { @@ -6113,8 +6099,7 @@ int ext4_ext_clear_bb(struct inode *inode) ext4_fc_record_regions(inode->i_sb, inode->i_ino, 0, path[j].p_block, 1, 1); } - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); } ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); ext4_fc_record_regions(inode->i_sb, inode->i_ino, diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 23167efda95e..cd0a861853e3 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -667,8 +667,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, } } out: - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); } static void ext4_es_insert_extent_ind_check(struct inode *inode, diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 2af962cbb835..ef05bfa87798 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -229,6 +229,12 @@ __releases(&EXT4_SB(inode->i_sb)->s_fc_lock) finish_wait(wq, &wait.wq_entry); } +static bool ext4_fc_disabled(struct super_block *sb) +{ + return (!test_opt2(sb, JOURNAL_FAST_COMMIT) || + (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)); +} + /* * Inform Ext4's fast about start of an inode update * @@ -240,8 +246,7 @@ void ext4_fc_start_update(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) + if (ext4_fc_disabled(inode->i_sb)) return; restart: @@ -265,8 +270,7 @@ void ext4_fc_stop_update(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) + if (ext4_fc_disabled(inode->i_sb)) return; if (atomic_dec_and_test(&ei->i_fc_updates)) @@ -283,8 +287,7 @@ void ext4_fc_del(struct inode *inode) struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_fc_dentry_update *fc_dentry; - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) + if (ext4_fc_disabled(inode->i_sb)) return; restart: @@ -337,8 +340,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl struct ext4_sb_info *sbi = EXT4_SB(sb); tid_t tid; - if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || - (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) + if (ext4_fc_disabled(sb)) return; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); @@ -493,10 +495,8 @@ void __ext4_fc_track_unlink(handle_t *handle, void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (sbi->s_mount_state & EXT4_FC_REPLAY)) + if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) @@ -522,10 +522,8 @@ void __ext4_fc_track_link(handle_t *handle, void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (sbi->s_mount_state & EXT4_FC_REPLAY)) + if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) @@ -551,10 +549,8 @@ void 
__ext4_fc_track_create(handle_t *handle, struct inode *inode, void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (sbi->s_mount_state & EXT4_FC_REPLAY)) + if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) @@ -576,22 +572,20 @@ static int __track_inode(struct inode *inode, void *arg, bool update) void ext4_fc_track_inode(handle_t *handle, struct inode *inode) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; if (S_ISDIR(inode->i_mode)) return; + if (ext4_fc_disabled(inode->i_sb)) + return; + if (ext4_should_journal_data(inode)) { ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); return; } - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (sbi->s_mount_state & EXT4_FC_REPLAY)) - return; - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; @@ -634,15 +628,13 @@ static int __track_range(struct inode *inode, void *arg, bool update) void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct __track_range_args args; int ret; if (S_ISDIR(inode->i_mode)) return; - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (sbi->s_mount_state & EXT4_FC_REPLAY)) + if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) @@ -710,10 +702,10 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) * After allocating len, we should have space at least for a 0 byte * padding. */ - if (len + sizeof(struct ext4_fc_tl) > bsize) + if (len + EXT4_FC_TAG_BASE_LEN > bsize) return NULL; - if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) { + if (bsize - off - 1 > len + EXT4_FC_TAG_BASE_LEN) { /* * Only allocate from current buffer if we have enough space for * this request AND we have space to add a zero byte padding. @@ -730,10 +722,10 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) /* Need to add PAD tag */ tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off); tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); - pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl); + pad_len = bsize - off - 1 - EXT4_FC_TAG_BASE_LEN; tl->fc_len = cpu_to_le16(pad_len); if (crc) - *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl)); + *crc = ext4_chksum(sbi, *crc, tl, EXT4_FC_TAG_BASE_LEN); if (pad_len > 0) ext4_fc_memzero(sb, tl + 1, pad_len, crc); ext4_fc_submit_bh(sb, false); @@ -775,7 +767,7 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) * ext4_fc_reserve_space takes care of allocating an extra block if * there's no enough space on this block for accommodating this tail. 
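Aside: this hunk replaces every sizeof(tl) with EXT4_FC_TAG_BASE_LEN, the fixed size of the tag-length header that prefixes each fast-commit record. A minimal sketch of the TLV (tag-length-value) encoding that ext4_fc_add_tlv() performs, in self-contained C with an illustrative fc_write_tlv() helper (not a kernel function); on disk the kernel stores both header fields little-endian via cpu_to_le16():

    #include <stdint.h>
    #include <string.h>

    struct fc_tl {                 /* mirrors struct ext4_fc_tl */
        uint16_t fc_tag;           /* record type, e.g. PAD, TAIL, INODE */
        uint16_t fc_len;           /* payload length; excludes this header */
    };
    #define FC_TAG_BASE_LEN sizeof(struct fc_tl)    /* 4 bytes */

    /* Append one record at dst and return the position of the next one. */
    static uint8_t *fc_write_tlv(uint8_t *dst, uint16_t tag,
                                 const void *val, uint16_t len)
    {
        struct fc_tl tl = { .fc_tag = tag, .fc_len = len };

        memcpy(dst, &tl, FC_TAG_BASE_LEN);
        memcpy(dst + FC_TAG_BASE_LEN, val, len);
        return dst + FC_TAG_BASE_LEN + len;
    }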
*/ - dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc); + dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc); if (!dst) return -ENOSPC; @@ -785,8 +777,8 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail)); sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); - ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc); - dst += sizeof(tl); + ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, &crc); + dst += EXT4_FC_TAG_BASE_LEN; tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc); dst += sizeof(tail.fc_tid); @@ -808,15 +800,15 @@ static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, struct ext4_fc_tl tl; u8 *dst; - dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc); + dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc); if (!dst) return false; tl.fc_tag = cpu_to_le16(tag); tl.fc_len = cpu_to_le16(len); - ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); - ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc); + ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc); + ext4_fc_memcpy(sb, dst + EXT4_FC_TAG_BASE_LEN, val, len, crc); return true; } @@ -828,8 +820,8 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, struct ext4_fc_dentry_info fcd; struct ext4_fc_tl tl; int dlen = fc_dentry->fcd_name.len; - u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen, - crc); + u8 *dst = ext4_fc_reserve_space(sb, + EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc); if (!dst) return false; @@ -838,8 +830,8 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino); tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op); tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); - ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); - dst += sizeof(tl); + ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc); + dst += EXT4_FC_TAG_BASE_LEN; ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc); dst += sizeof(fcd); ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc); @@ -874,22 +866,25 @@ static int ext4_fc_write_inode(struct inode *inode, u32 *crc) tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE); tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino)); + ret = -ECANCELED; dst = ext4_fc_reserve_space(inode->i_sb, - sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc); + EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc); if (!dst) - return -ECANCELED; + goto err; - if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc)) - return -ECANCELED; - dst += sizeof(tl); + if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc)) + goto err; + dst += EXT4_FC_TAG_BASE_LEN; if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc)) - return -ECANCELED; + goto err; dst += sizeof(fc_inode); if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc), inode_len, crc)) - return -ECANCELED; - - return 0; + goto err; + ret = 0; +err: + brelse(iloc.bh); + return ret; } /* @@ -1343,7 +1338,7 @@ struct dentry_info_args { }; static inline void tl_to_darg(struct dentry_info_args *darg, - struct ext4_fc_tl *tl, u8 *val) + struct ext4_fc_tl *tl, u8 *val) { struct ext4_fc_dentry_info fcd; @@ -1352,8 +1347,14 @@ static inline void tl_to_darg(struct dentry_info_args *darg, darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino); darg->ino = le32_to_cpu(fcd.fc_ino); darg->dname = val + 
offsetof(struct ext4_fc_dentry_info, fc_dname); - darg->dname_len = le16_to_cpu(tl->fc_len) - - sizeof(struct ext4_fc_dentry_info); + darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info); +} + +static inline void ext4_fc_get_tl(struct ext4_fc_tl *tl, u8 *val) +{ + memcpy(tl, val, EXT4_FC_TAG_BASE_LEN); + tl->fc_len = le16_to_cpu(tl->fc_len); + tl->fc_tag = le16_to_cpu(tl->fc_tag); } /* Unlink replay function */ @@ -1491,13 +1492,15 @@ static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) if (state->fc_modified_inodes[i] == ino) return 0; if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { - state->fc_modified_inodes = krealloc( - state->fc_modified_inodes, + int *fc_modified_inodes; + + fc_modified_inodes = krealloc(state->fc_modified_inodes, sizeof(int) * (state->fc_modified_inodes_size + EXT4_FC_REPLAY_REALLOC_INCREMENT), GFP_KERNEL); - if (!state->fc_modified_inodes) + if (!fc_modified_inodes) return -ENOMEM; + state->fc_modified_inodes = fc_modified_inodes; state->fc_modified_inodes_size += EXT4_FC_REPLAY_REALLOC_INCREMENT; } @@ -1516,7 +1519,7 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl, struct ext4_inode *raw_fc_inode; struct inode *inode = NULL; struct ext4_iloc iloc; - int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag); + int inode_len, ino, ret, tag = tl->fc_tag; struct ext4_extent_header *eh; memcpy(&fc_inode, val, sizeof(fc_inode)); @@ -1541,7 +1544,7 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl, if (ret) goto out; - inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode); + inode_len = tl->fc_len - sizeof(struct ext4_fc_inode); raw_inode = ext4_raw_inode(&iloc); memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block)); @@ -1682,15 +1685,18 @@ int ext4_fc_record_regions(struct super_block *sb, int ino, if (replay && state->fc_regions_used != state->fc_regions_valid) state->fc_regions_used = state->fc_regions_valid; if (state->fc_regions_used == state->fc_regions_size) { + struct ext4_fc_alloc_region *fc_regions; + + fc_regions = krealloc(state->fc_regions, + sizeof(struct ext4_fc_alloc_region) * + (state->fc_regions_size + + EXT4_FC_REPLAY_REALLOC_INCREMENT), + GFP_KERNEL); + if (!fc_regions) + return -ENOMEM; state->fc_regions_size += EXT4_FC_REPLAY_REALLOC_INCREMENT; - state->fc_regions = krealloc( - state->fc_regions, - state->fc_regions_size * - sizeof(struct ext4_fc_alloc_region), - GFP_KERNEL); - if (!state->fc_regions) - return -ENOMEM; + state->fc_regions = fc_regions; } region = &state->fc_regions[state->fc_regions_used++]; region->ino = ino; @@ -1770,8 +1776,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb, ret = ext4_ext_insert_extent( NULL, inode, &path, &newex, 0); up_write((&EXT4_I(inode)->i_data_sem)); - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); if (ret) goto out; goto next; @@ -1926,8 +1931,7 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) for (j = 0; j < path->p_depth; j++) ext4_mb_mark_bb(inode->i_sb, path[j].p_block, 1, 1); - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); } cur += ret; ext4_mb_mark_bb(inode->i_sb, map.m_pblk, @@ -1972,6 +1976,34 @@ void ext4_fc_replay_cleanup(struct super_block *sb) kfree(sbi->s_fc_replay_state.fc_modified_inodes); } +static inline bool ext4_fc_tag_len_isvalid(struct ext4_fc_tl *tl, + u8 *val, u8 *end) +{ + if (val + tl->fc_len > end) + return false; + + /* Here only check 
ADD_RANGE/TAIL/HEAD which will read data when do + * journal rescan before do CRC check. Other tags length check will + * rely on CRC check. + */ + switch (tl->fc_tag) { + case EXT4_FC_TAG_ADD_RANGE: + return (sizeof(struct ext4_fc_add_range) == tl->fc_len); + case EXT4_FC_TAG_TAIL: + return (sizeof(struct ext4_fc_tail) <= tl->fc_len); + case EXT4_FC_TAG_HEAD: + return (sizeof(struct ext4_fc_head) == tl->fc_len); + case EXT4_FC_TAG_DEL_RANGE: + case EXT4_FC_TAG_LINK: + case EXT4_FC_TAG_UNLINK: + case EXT4_FC_TAG_CREAT: + case EXT4_FC_TAG_INODE: + case EXT4_FC_TAG_PAD: + default: + return true; + } +} + /* * Recovery Scan phase handler * @@ -2028,12 +2060,18 @@ static int ext4_fc_replay_scan(journal_t *journal, } state->fc_replay_expected_off++; - for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) { - memcpy(&tl, cur, sizeof(tl)); - val = cur + sizeof(tl); + for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN; + cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { + ext4_fc_get_tl(&tl, cur); + val = cur + EXT4_FC_TAG_BASE_LEN; + if (!ext4_fc_tag_len_isvalid(&tl, val, end)) { + ret = state->fc_replay_num_tags ? + JBD2_FC_REPLAY_STOP : -ECANCELED; + goto out_err; + } ext4_debug("Scan phase, tag:%s, blk %lld\n", - tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr); - switch (le16_to_cpu(tl.fc_tag)) { + tag2str(tl.fc_tag), bh->b_blocknr); + switch (tl.fc_tag) { case EXT4_FC_TAG_ADD_RANGE: memcpy(&ext, val, sizeof(ext)); ex = (struct ext4_extent *)&ext.fc_ex; @@ -2053,13 +2091,13 @@ static int ext4_fc_replay_scan(journal_t *journal, case EXT4_FC_TAG_PAD: state->fc_cur_tag++; state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, - sizeof(tl) + le16_to_cpu(tl.fc_len)); + EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; case EXT4_FC_TAG_TAIL: state->fc_cur_tag++; memcpy(&tail, val, sizeof(tail)); state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, - sizeof(tl) + + EXT4_FC_TAG_BASE_LEN + offsetof(struct ext4_fc_tail, fc_crc)); if (le32_to_cpu(tail.fc_tid) == expected_tid && @@ -2086,7 +2124,7 @@ static int ext4_fc_replay_scan(journal_t *journal, } state->fc_cur_tag++; state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, - sizeof(tl) + le16_to_cpu(tl.fc_len)); + EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; default: ret = state->fc_replay_num_tags ? 
@@ -2141,19 +2179,20 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, start = (u8 *)bh->b_data; end = (__u8 *)bh->b_data + journal->j_blocksize - 1; - for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) { - memcpy(&tl, cur, sizeof(tl)); - val = cur + sizeof(tl); + for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN; + cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { + ext4_fc_get_tl(&tl, cur); + val = cur + EXT4_FC_TAG_BASE_LEN; if (state->fc_replay_num_tags == 0) { ret = JBD2_FC_REPLAY_STOP; ext4_fc_set_bitmaps_and_counters(sb); break; } - ext4_debug("Replay phase, tag:%s\n", - tag2str(le16_to_cpu(tl.fc_tag))); + + ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag)); state->fc_replay_num_tags--; - switch (le16_to_cpu(tl.fc_tag)) { + switch (tl.fc_tag) { case EXT4_FC_TAG_LINK: ret = ext4_fc_replay_link(sb, &tl, val); break; @@ -2174,19 +2213,18 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, break; case EXT4_FC_TAG_PAD: trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0, - le16_to_cpu(tl.fc_len), 0); + tl.fc_len, 0); break; case EXT4_FC_TAG_TAIL: - trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0, - le16_to_cpu(tl.fc_len), 0); + trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, + 0, tl.fc_len, 0); memcpy(&tail, val, sizeof(tail)); WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid); break; case EXT4_FC_TAG_HEAD: break; default: - trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0, - le16_to_cpu(tl.fc_len), 0); + trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0); ret = -ECANCELED; break; } diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 1db12847a83b..a6154c3ed135 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -70,6 +70,9 @@ struct ext4_fc_tail { __le32 fc_crc; }; +/* Tag base length */ +#define EXT4_FC_TAG_BASE_LEN (sizeof(struct ext4_fc_tl)) + /* * Fast commit status codes */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 109d07629f81..a7a597c727e6 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -36,19 +36,34 @@ #include "acl.h" #include "truncate.h" -static bool ext4_dio_supported(struct kiocb *iocb, struct iov_iter *iter) +/* + * Returns %true if the given DIO request should be attempted with DIO, or + * %false if it should fall back to buffered I/O. + * + * DIO isn't well specified; when it's unsupported (either due to the request + * being misaligned, or due to the file not supporting DIO at all), filesystems + * either fall back to buffered I/O or return EINVAL. For files that don't use + * any special features like encryption or verity, ext4 has traditionally + * returned EINVAL for misaligned DIO. iomap_dio_rw() uses this convention too. + * In this case, we should attempt the DIO, *not* fall back to buffered I/O. + * + * In contrast, in cases where DIO is unsupported due to ext4 features, ext4 + * traditionally falls back to buffered I/O. + * + * This function implements the traditional ext4 behavior in all these cases. 
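Aside: the comment above describes a three-way contract that the new ext4_dio_alignment() helper (added in inode.c below) encodes in a single u32: 0 means DIO is unsupported, 1 means no filesystem-imposed restriction beyond the iomap defaults, and any larger value is a required alignment for both the file offset and the user buffer. A minimal sketch of that decision, with should_use_dio() as an illustrative stand-in:

    #include <stdbool.h>
    #include <stdint.h>

    static bool should_use_dio(uint64_t pos, uint64_t iter_align, uint32_t dio_align)
    {
        if (dio_align == 0)     /* e.g. verity, journalled data, inline data */
            return false;       /* fall back to buffered I/O */
        if (dio_align == 1)     /* no restriction: let iomap apply its defaults */
            return true;
        /* e.g. encrypted files: offset and memory must be fs-block aligned */
        return ((pos | iter_align) & (uint64_t)(dio_align - 1)) == 0;
    }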
+ */ +static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter) { struct inode *inode = file_inode(iocb->ki_filp); + u32 dio_align = ext4_dio_alignment(inode); - if (!fscrypt_dio_supported(iocb, iter)) - return false; - if (fsverity_active(inode)) - return false; - if (ext4_should_journal_data(inode)) + if (dio_align == 0) return false; - if (ext4_has_inline_data(inode)) - return false; - return true; + + if (dio_align == 1) + return true; + + return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align); } static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -63,7 +78,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) inode_lock_shared(inode); } - if (!ext4_dio_supported(iocb, to)) { + if (!ext4_should_use_dio(iocb, to)) { inode_unlock_shared(inode); /* * Fallback to buffered I/O if the operation being performed on @@ -511,7 +526,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) } /* Fallback to buffered I/O if the inode does not support direct I/O. */ - if (!ext4_dio_supported(iocb, from)) { + if (!ext4_should_use_dio(iocb, from)) { if (ilock_shared) inode_unlock_shared(inode); else @@ -528,6 +543,12 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = -EAGAIN; goto out; } + /* + * Make sure inline data cannot be created anymore since we are going + * to allocate blocks for DIO. We know the inode does not have any + * inline data now because ext4_dio_supported() checked for that. + */ + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); offset = iocb->ki_pos; count = ret; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f73e5eb43eae..e9bc46684106 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -463,10 +463,9 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, hinfo.hash_version = DX_HASH_HALF_MD4; hinfo.seed = sbi->s_hash_seed; ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo); - grp = hinfo.hash; + parent_group = hinfo.hash % ngroups; } else - grp = prandom_u32(); - parent_group = (unsigned)grp % ngroups; + parent_group = prandom_u32_max(ngroups); for (i = 0; i < ngroups; i++) { g = (parent_group + i) % ngroups; get_orlov_stats(sb, g, flex_size, &stats); @@ -510,7 +509,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, goto fallback; } - max_dirs = ndirs / ngroups + inodes_per_group / 16; + max_dirs = ndirs / ngroups + inodes_per_group*flex_size / 16; min_inodes = avefreei - inodes_per_group*flex_size / 4; if (min_inodes < 1) min_inodes = 1; @@ -1280,7 +1279,7 @@ got: EXT4_GROUP_INFO_IBITMAP_CORRUPT); goto out; } - inode->i_generation = prandom_u32(); + inode->i_generation = get_random_u32(); /* Precompute checksum seed for inode metadata */ if (ext4_has_metadata_csum(sb)) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 601214453c3a..2b5ef1b64249 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1188,6 +1188,13 @@ retry_grab: page = grab_cache_page_write_begin(mapping, index); if (!page) return -ENOMEM; + /* + * The same as page allocation, we prealloc buffer heads before + * starting the handle. 
+ */ + if (!page_has_buffers(page)) + create_empty_buffers(page, inode->i_sb->s_blocksize, 0); + unlock_page(page); retry_journal: @@ -5342,6 +5349,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, int error, rc = 0; int orphan = 0; const unsigned int ia_valid = attr->ia_valid; + bool inc_ivers = true; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; @@ -5425,8 +5433,8 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, return -EINVAL; } - if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size) - inode_inc_iversion(inode); + if (attr->ia_size == inode->i_size) + inc_ivers = false; if (shrink) { if (ext4_should_order_data(inode)) { @@ -5528,6 +5536,8 @@ out_mmap_sem: } if (!error) { + if (inc_ivers) + inode_inc_iversion(inode); setattr_copy(mnt_userns, inode, attr); mark_inode_dirty(inode); } @@ -5550,6 +5560,22 @@ err_out: return error; } +u32 ext4_dio_alignment(struct inode *inode) +{ + if (fsverity_active(inode)) + return 0; + if (ext4_should_journal_data(inode)) + return 0; + if (ext4_has_inline_data(inode)) + return 0; + if (IS_ENCRYPTED(inode)) { + if (!fscrypt_dio_supported(inode)) + return 0; + return i_blocksize(inode); + } + return 1; /* use the iomap defaults */ +} + int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -5565,6 +5591,27 @@ int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path, stat->btime.tv_nsec = ei->i_crtime.tv_nsec; } + /* + * Return the DIO alignment restrictions if requested. We only return + * this information when requested, since on encrypted files it might + * take a fair bit of work to get if the file wasn't opened recently. 
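Aside: with this patch, userspace can query these limits through statx(2). A minimal sketch, assuming kernel headers new enough to define STATX_DIOALIGN; older kernels simply leave the bit out of stx_mask, so the check below is required:

    #include <stdio.h>
    #include <fcntl.h>
    #include <sys/stat.h>

    int main(int argc, char **argv)
    {
        struct statx stx;

        if (argc < 2 || statx(AT_FDCWD, argv[1], 0, STATX_DIOALIGN, &stx) != 0)
            return 1;
        if (stx.stx_mask & STATX_DIOALIGN)      /* kernel filled the fields in */
            printf("dio mem align %u, offset align %u\n",
                   stx.stx_dio_mem_align, stx.stx_dio_offset_align);
        return 0;
    }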
+ */ + if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) { + u32 dio_align = ext4_dio_alignment(inode); + + stat->result_mask |= STATX_DIOALIGN; + if (dio_align == 1) { + struct block_device *bdev = inode->i_sb->s_bdev; + + /* iomap defaults */ + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; + stat->dio_offset_align = bdev_logical_block_size(bdev); + } else { + stat->dio_mem_align = dio_align; + stat->dio_offset_align = dio_align; + } + } + flags = ei->i_flags & EXT4_FL_USER_VISIBLE; if (flags & EXT4_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; @@ -5731,9 +5778,6 @@ int ext4_mark_iloc_dirty(handle_t *handle, } ext4_fc_track_inode(handle, inode); - if (IS_I_VERSION(inode)) - inode_inc_iversion(inode); - /* the do_update_inode consumes one bh->b_count */ get_bh(iloc->bh); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 3cf3ec4b1c21..ded535535b27 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -452,9 +452,10 @@ static long swap_inode_boot_loader(struct super_block *sb, swap_inode_data(inode, inode_bl); inode->i_ctime = inode_bl->i_ctime = current_time(inode); + inode_inc_iversion(inode); - inode->i_generation = prandom_u32(); - inode_bl->i_generation = prandom_u32(); + inode->i_generation = get_random_u32(); + inode_bl->i_generation = get_random_u32(); ext4_reset_inode_seed(inode); ext4_reset_inode_seed(inode_bl); @@ -665,6 +666,7 @@ static int ext4_ioctl_setflags(struct inode *inode, ext4_set_inode_flags(inode, false); inode->i_ctime = current_time(inode); + inode_inc_iversion(inode); err = ext4_mark_iloc_dirty(handle, inode, &iloc); flags_err: @@ -775,6 +777,7 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) EXT4_I(inode)->i_projid = kprojid; inode->i_ctime = current_time(inode); + inode_inc_iversion(inode); out_dirty: rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) @@ -1060,9 +1063,6 @@ static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg) if (!EXT4_SB(sb)->s_journal) return -ENODEV; - if (flags & ~EXT4_IOC_CHECKPOINT_FLAG_VALID) - return -EINVAL; - if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !bdev_max_discard_sectors(EXT4_SB(sb)->s_journal->j_dev)) return -EOPNOTSUPP; @@ -1257,6 +1257,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) err = ext4_reserve_inode_write(handle, inode, &iloc); if (err == 0) { inode->i_ctime = current_time(inode); + inode_inc_iversion(inode); inode->i_generation = generation; err = ext4_mark_iloc_dirty(handle, inode, &iloc); } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index bd8f8b5c3d30..9dad93059945 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -140,13 +140,15 @@ * number of buddy bitmap orders possible) number of lists. Group-infos are * placed in appropriate lists. * - * 2) Average fragment size rb tree (sbi->s_mb_avg_fragment_size_root) + * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size) * - * Locking: sbi->s_mb_rb_lock (rwlock) + * Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks) * - * This is a red black tree consisting of group infos and the tree is sorted - * by average fragment sizes (which is calculated as ext4_group_info->bb_free - * / ext4_group_info->bb_fragments). + * This is an array of lists where in the i-th list there are groups with + * average fragment size >= 2^i and < 2^(i+1). The average fragment size + * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments. 
+ * Note that we don't bother with a special list for completely empty groups + * so we only have MB_NUM_ORDERS(sb) lists. * * When "mb_optimize_scan" mount option is set, mballoc consults the above data * structures to decide the order in which groups are to be traversed for @@ -160,7 +162,8 @@ * * At CR = 1, we only consider groups where average fragment size > request * size. So, we lookup a group which has average fragment size just above or - * equal to request size using our rb tree (data structure 2) in O(log N) time. + * equal to request size using our average fragment size group lists (data + * structure 2) in O(1) time. * * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in * linear order which requires O(N) search time for each CR 0 and CR 1 phase. @@ -802,65 +805,51 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, } } -static void ext4_mb_rb_insert(struct rb_root *root, struct rb_node *new, - int (*cmp)(struct rb_node *, struct rb_node *)) +static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len) { - struct rb_node **iter = &root->rb_node, *parent = NULL; + int order; - while (*iter) { - parent = *iter; - if (cmp(new, *iter) > 0) - iter = &((*iter)->rb_left); - else - iter = &((*iter)->rb_right); - } - - rb_link_node(new, parent, iter); - rb_insert_color(new, root); -} - -static int -ext4_mb_avg_fragment_size_cmp(struct rb_node *rb1, struct rb_node *rb2) -{ - struct ext4_group_info *grp1 = rb_entry(rb1, - struct ext4_group_info, - bb_avg_fragment_size_rb); - struct ext4_group_info *grp2 = rb_entry(rb2, - struct ext4_group_info, - bb_avg_fragment_size_rb); - int num_frags_1, num_frags_2; - - num_frags_1 = grp1->bb_fragments ? - grp1->bb_free / grp1->bb_fragments : 0; - num_frags_2 = grp2->bb_fragments ? - grp2->bb_free / grp2->bb_fragments : 0; - - return (num_frags_2 - num_frags_1); + /* + * We don't bother with a special lists groups with only 1 block free + * extents and for completely empty groups. + */ + order = fls(len) - 2; + if (order < 0) + return 0; + if (order == MB_NUM_ORDERS(sb)) + order--; + return order; } -/* - * Reinsert grpinfo into the avg_fragment_size tree with new average - * fragment size. 
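Aside: a worked example of the bucket computation in mb_avg_fragment_size_order() above. A group with bb_free = 512 and bb_fragments = 3 has an average fragment size of 170 blocks; fls(170) = 8 (bit 7 is the highest set bit, and fls() counts from 1), so the group lands on list 8 - 2 = 6. A self-contained sketch, with fls() emulated via __builtin_clz():

    static int fls32(unsigned int x)       /* kernel fls(): 1-based, fls(0) == 0 */
    {
        return x ? 32 - (int)__builtin_clz(x) : 0;
    }

    /* num_orders plays the role of MB_NUM_ORDERS(sb). */
    static int avg_fragment_size_order(int num_orders, int len)
    {
        int order = fls32(len) - 2;        /* len = 170 -> 8 - 2 = 6 */

        if (order < 0)                     /* len 0 or 1: lowest list */
            return 0;
        if (order == num_orders)           /* clamp the largest sizes */
            order--;
        return order;
    }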
- */ +/* Move group to appropriate avg_fragment_size list */ static void mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); + int new_order; if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0) return; - write_lock(&sbi->s_mb_rb_lock); - if (!RB_EMPTY_NODE(&grp->bb_avg_fragment_size_rb)) { - rb_erase(&grp->bb_avg_fragment_size_rb, - &sbi->s_mb_avg_fragment_size_root); - RB_CLEAR_NODE(&grp->bb_avg_fragment_size_rb); - } + new_order = mb_avg_fragment_size_order(sb, + grp->bb_free / grp->bb_fragments); + if (new_order == grp->bb_avg_fragment_size_order) + return; - ext4_mb_rb_insert(&sbi->s_mb_avg_fragment_size_root, - &grp->bb_avg_fragment_size_rb, - ext4_mb_avg_fragment_size_cmp); - write_unlock(&sbi->s_mb_rb_lock); + if (grp->bb_avg_fragment_size_order != -1) { + write_lock(&sbi->s_mb_avg_fragment_size_locks[ + grp->bb_avg_fragment_size_order]); + list_del(&grp->bb_avg_fragment_size_node); + write_unlock(&sbi->s_mb_avg_fragment_size_locks[ + grp->bb_avg_fragment_size_order]); + } + grp->bb_avg_fragment_size_order = new_order; + write_lock(&sbi->s_mb_avg_fragment_size_locks[ + grp->bb_avg_fragment_size_order]); + list_add_tail(&grp->bb_avg_fragment_size_node, + &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]); + write_unlock(&sbi->s_mb_avg_fragment_size_locks[ + grp->bb_avg_fragment_size_order]); } /* @@ -909,86 +898,55 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, *new_cr = 1; } else { *group = grp->bb_group; - ac->ac_last_optimal_group = *group; ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; } } /* - * Choose next group by traversing average fragment size tree. Updates *new_cr - * if cr lvel needs an update. Sets EXT4_MB_SEARCH_NEXT_LINEAR to indicate that - * the linear search should continue for one iteration since there's lock - * contention on the rb tree lock. + * Choose next group by traversing average fragment size list of suitable + * order. Updates *new_cr if cr level needs an update. */ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, int *new_cr, ext4_group_t *group, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - int avg_fragment_size, best_so_far; - struct rb_node *node, *found; - struct ext4_group_info *grp; - - /* - * If there is contention on the lock, instead of waiting for the lock - * to become available, just continue searching lineraly. We'll resume - * our rb tree search later starting at ac->ac_last_optimal_group. 
- */ - if (!read_trylock(&sbi->s_mb_rb_lock)) { - ac->ac_flags |= EXT4_MB_SEARCH_NEXT_LINEAR; - return; - } + struct ext4_group_info *grp = NULL, *iter; + int i; if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { if (sbi->s_mb_stats) atomic_inc(&sbi->s_bal_cr1_bad_suggestions); - /* We have found something at CR 1 in the past */ - grp = ext4_get_group_info(ac->ac_sb, ac->ac_last_optimal_group); - for (found = rb_next(&grp->bb_avg_fragment_size_rb); found != NULL; - found = rb_next(found)) { - grp = rb_entry(found, struct ext4_group_info, - bb_avg_fragment_size_rb); + } + + for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); + i < MB_NUM_ORDERS(ac->ac_sb); i++) { + if (list_empty(&sbi->s_mb_avg_fragment_size[i])) + continue; + read_lock(&sbi->s_mb_avg_fragment_size_locks[i]); + if (list_empty(&sbi->s_mb_avg_fragment_size[i])) { + read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); + continue; + } + list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i], + bb_avg_fragment_size_node) { if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[1]); - if (likely(ext4_mb_good_group(ac, grp->bb_group, 1))) + if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) { + grp = iter; break; - } - goto done; - } - - node = sbi->s_mb_avg_fragment_size_root.rb_node; - best_so_far = 0; - found = NULL; - - while (node) { - grp = rb_entry(node, struct ext4_group_info, - bb_avg_fragment_size_rb); - avg_fragment_size = 0; - if (ext4_mb_good_group(ac, grp->bb_group, 1)) { - avg_fragment_size = grp->bb_fragments ? - grp->bb_free / grp->bb_fragments : 0; - if (!best_so_far || avg_fragment_size < best_so_far) { - best_so_far = avg_fragment_size; - found = node; } } - if (avg_fragment_size > ac->ac_g_ex.fe_len) - node = node->rb_right; - else - node = node->rb_left; + read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); + if (grp) + break; } -done: - if (found) { - grp = rb_entry(found, struct ext4_group_info, - bb_avg_fragment_size_rb); + if (grp) { *group = grp->bb_group; ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; } else { *new_cr = 2; } - - read_unlock(&sbi->s_mb_rb_lock); - ac->ac_last_optimal_group = *group; } static inline int should_optimize_scan(struct ext4_allocation_context *ac) @@ -1017,11 +975,6 @@ next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups) goto inc_and_return; } - if (ac->ac_flags & EXT4_MB_SEARCH_NEXT_LINEAR) { - ac->ac_flags &= ~EXT4_MB_SEARCH_NEXT_LINEAR; - goto inc_and_return; - } - return group; inc_and_return: /* @@ -1049,8 +1002,10 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, { *new_cr = ac->ac_criteria; - if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) + if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) { + *group = next_linear_group(ac, *group, ngroups); return; + } if (*new_cr == 0) { ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); @@ -1075,23 +1030,25 @@ mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) struct ext4_sb_info *sbi = EXT4_SB(sb); int i; - if (test_opt2(sb, MB_OPTIMIZE_SCAN) && grp->bb_largest_free_order >= 0) { + for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) + if (grp->bb_counters[i] > 0) + break; + /* No need to move between order lists? 
*/ + if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || + i == grp->bb_largest_free_order) { + grp->bb_largest_free_order = i; + return; + } + + if (grp->bb_largest_free_order >= 0) { write_lock(&sbi->s_mb_largest_free_orders_locks[ grp->bb_largest_free_order]); list_del_init(&grp->bb_largest_free_order_node); write_unlock(&sbi->s_mb_largest_free_orders_locks[ grp->bb_largest_free_order]); } - grp->bb_largest_free_order = -1; /* uninit */ - - for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) { - if (grp->bb_counters[i] > 0) { - grp->bb_largest_free_order = i; - break; - } - } - if (test_opt2(sb, MB_OPTIMIZE_SCAN) && - grp->bb_largest_free_order >= 0 && grp->bb_free) { + grp->bb_largest_free_order = i; + if (grp->bb_largest_free_order >= 0 && grp->bb_free) { write_lock(&sbi->s_mb_largest_free_orders_locks[ grp->bb_largest_free_order]); list_add_tail(&grp->bb_largest_free_order_node, @@ -1148,13 +1105,13 @@ void ext4_mb_generate_buddy(struct super_block *sb, EXT4_GROUP_INFO_BBITMAP_CORRUPT); } mb_set_largest_free_order(sb, grp); + mb_update_avg_fragment_size(sb, grp); clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); period = get_cycles() - period; atomic_inc(&sbi->s_mb_buddies_generated); atomic64_add(period, &sbi->s_mb_generation_time); - mb_update_avg_fragment_size(sb, grp); } /* The buddy information is attached the buddy cache inode @@ -2636,7 +2593,7 @@ static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { ext4_group_t prefetch_grp = 0, ngroups, group, i; - int cr = -1; + int cr = -1, new_cr; int err = 0, first_err = 0; unsigned int nr = 0, prefetch_ios = 0; struct ext4_sb_info *sbi; @@ -2707,17 +2664,14 @@ repeat: * from the goal value specified */ group = ac->ac_g_ex.fe_group; - ac->ac_last_optimal_group = group; ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; prefetch_grp = group; - for (i = 0; i < ngroups; group = next_linear_group(ac, group, ngroups), - i++) { - int ret = 0, new_cr; + for (i = 0, new_cr = cr; i < ngroups; i++, + ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) { + int ret = 0; cond_resched(); - - ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups); if (new_cr != cr) { cr = new_cr; goto repeat; @@ -2991,9 +2945,7 @@ __acquires(&EXT4_SB(sb)->s_mb_rb_lock) struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; - read_lock(&EXT4_SB(sb)->s_mb_rb_lock); - - if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1) + if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) return NULL; position = *pos + 1; return (void *) ((unsigned long) position); @@ -3005,7 +2957,7 @@ static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, lof unsigned long position; ++*pos; - if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1) + if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) return NULL; position = *pos + 1; return (void *) ((unsigned long) position); @@ -3017,29 +2969,22 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned long position = ((unsigned long) v); struct ext4_group_info *grp; - struct rb_node *n; - unsigned int count, min, max; + unsigned int count; position--; if (position >= MB_NUM_ORDERS(sb)) { - seq_puts(seq, "fragment_size_tree:\n"); - n = rb_first(&sbi->s_mb_avg_fragment_size_root); - if (!n) { - seq_puts(seq, "\ttree_min: 0\n\ttree_max: 0\n\ttree_nodes: 0\n"); - return 0; - } - grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb); - min = grp->bb_fragments ? 
grp->bb_free / grp->bb_fragments : 0; - count = 1; - while (rb_next(n)) { - count++; - n = rb_next(n); - } - grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb); - max = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0; + position -= MB_NUM_ORDERS(sb); + if (position == 0) + seq_puts(seq, "avg_fragment_size_lists:\n"); - seq_printf(seq, "\ttree_min: %u\n\ttree_max: %u\n\ttree_nodes: %u\n", - min, max, count); + count = 0; + read_lock(&sbi->s_mb_avg_fragment_size_locks[position]); + list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position], + bb_avg_fragment_size_node) + count++; + read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]); + seq_printf(seq, "\tlist_order_%u_groups: %u\n", + (unsigned int)position, count); return 0; } @@ -3049,9 +2994,11 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) seq_puts(seq, "max_free_order_lists:\n"); } count = 0; + read_lock(&sbi->s_mb_largest_free_orders_locks[position]); list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position], bb_largest_free_order_node) count++; + read_unlock(&sbi->s_mb_largest_free_orders_locks[position]); seq_printf(seq, "\tlist_order_%u_groups: %u\n", (unsigned int)position, count); @@ -3059,11 +3006,7 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) } static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) -__releases(&EXT4_SB(sb)->s_mb_rb_lock) { - struct super_block *sb = pde_data(file_inode(seq->file)); - - read_unlock(&EXT4_SB(sb)->s_mb_rb_lock); } const struct seq_operations ext4_mb_seq_structs_summary_ops = { @@ -3176,8 +3119,9 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, init_rwsem(&meta_group_info[i]->alloc_sem); meta_group_info[i]->bb_free_root = RB_ROOT; INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node); - RB_CLEAR_NODE(&meta_group_info[i]->bb_avg_fragment_size_rb); + INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node); meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ + meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */ meta_group_info[i]->bb_group = group; mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); @@ -3426,7 +3370,24 @@ int ext4_mb_init(struct super_block *sb) i++; } while (i < MB_NUM_ORDERS(sb)); - sbi->s_mb_avg_fragment_size_root = RB_ROOT; + sbi->s_mb_avg_fragment_size = + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), + GFP_KERNEL); + if (!sbi->s_mb_avg_fragment_size) { + ret = -ENOMEM; + goto out; + } + sbi->s_mb_avg_fragment_size_locks = + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), + GFP_KERNEL); + if (!sbi->s_mb_avg_fragment_size_locks) { + ret = -ENOMEM; + goto out; + } + for (i = 0; i < MB_NUM_ORDERS(sb); i++) { + INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]); + rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]); + } sbi->s_mb_largest_free_orders = kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), GFP_KERNEL); @@ -3445,7 +3406,6 @@ int ext4_mb_init(struct super_block *sb) INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]); rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]); } - rwlock_init(&sbi->s_mb_rb_lock); spin_lock_init(&sbi->s_md_lock); sbi->s_mb_free_pending = 0; @@ -3516,6 +3476,8 @@ out_free_locality_groups: free_percpu(sbi->s_locality_groups); sbi->s_locality_groups = NULL; out: + kfree(sbi->s_mb_avg_fragment_size); + kfree(sbi->s_mb_avg_fragment_size_locks); kfree(sbi->s_mb_largest_free_orders); kfree(sbi->s_mb_largest_free_orders_locks); 
kfree(sbi->s_mb_offsets); @@ -3582,6 +3544,8 @@ int ext4_mb_release(struct super_block *sb) kvfree(group_info); rcu_read_unlock(); } + kfree(sbi->s_mb_avg_fragment_size); + kfree(sbi->s_mb_avg_fragment_size_locks); kfree(sbi->s_mb_largest_free_orders); kfree(sbi->s_mb_largest_free_orders_locks); kfree(sbi->s_mb_offsets); @@ -5193,6 +5157,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int bsbits = ac->ac_sb->s_blocksize_bits; loff_t size, isize; + bool inode_pa_eligible, group_pa_eligible; if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return; @@ -5200,25 +5165,27 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) return; + group_pa_eligible = sbi->s_mb_group_prealloc > 0; + inode_pa_eligible = true; size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) >> bsbits; + /* No point in using inode preallocation for closed files */ if ((size == isize) && !ext4_fs_is_busy(sbi) && - !inode_is_open_for_write(ac->ac_inode)) { - ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; - return; - } + !inode_is_open_for_write(ac->ac_inode)) + inode_pa_eligible = false; - if (sbi->s_mb_group_prealloc <= 0) { - ac->ac_flags |= EXT4_MB_STREAM_ALLOC; - return; - } - - /* don't use group allocation for large files */ size = max(size, isize); - if (size > sbi->s_mb_stream_request) { - ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + /* Don't use group allocation for large files */ + if (size > sbi->s_mb_stream_request) + group_pa_eligible = false; + + if (!group_pa_eligible) { + if (inode_pa_eligible) + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + else + ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; return; } @@ -5565,6 +5532,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ext4_fsblk_t block = 0; unsigned int inquota = 0; unsigned int reserv_clstrs = 0; + int retries = 0; u64 seq; might_sleep(); @@ -5667,7 +5635,8 @@ repeat: ar->len = ac->ac_b_ex.fe_len; } } else { - if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) + if (++retries < 3 && + ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) goto repeat; /* * If block allocation fails then the pa allocated above diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 39da92ceabf8..dcda2a943cee 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -178,7 +178,6 @@ struct ext4_allocation_context { /* copy of the best found extent taken before preallocation efforts */ struct ext4_free_extent ac_f_ex; - ext4_group_t ac_last_optimal_group; __u32 ac_groups_considered; __u32 ac_flags; /* allocation hints */ __u16 ac_groups_scanned; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 54e7d3c95fd7..0a220ec9862d 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -56,8 +56,7 @@ static int finish_range(handle_t *handle, struct inode *inode, retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0); err_out: up_write((&EXT4_I(inode)->i_data_sem)); - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); lb->first_pblock = 0; return retval; } diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 9af68a7ecdcf..588cb09c5291 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -265,7 +265,7 @@ static unsigned int mmp_new_seq(void) u32 new_seq; do { - new_seq = prandom_u32(); + new_seq = get_random_u32(); } while (new_seq > EXT4_MMP_SEQ_MAX); return new_seq; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 
701f1d6a217f..044e34cd835c 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -32,8 +32,7 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock, if (IS_ERR(path)) return PTR_ERR(path); if (path[ext_depth(inode)].p_ext == NULL) { - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); *ppath = NULL; return -ENODATA; } @@ -103,12 +102,10 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, if (unwritten != ext4_ext_is_unwritten(ext)) goto out; from += ext4_ext_get_actual_len(ext); - ext4_ext_drop_refs(path); } ret = 1; out: - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); return ret; } @@ -472,19 +469,17 @@ mext_check_arguments(struct inode *orig_inode, if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) return -EPERM; - /* Ext4 move extent does not support swapfile */ + /* Ext4 move extent does not support swap files */ if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { - ext4_debug("ext4 move extent: The argument files should " - "not be swapfile [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); - return -EBUSY; + return -ETXTBSY; } if (ext4_is_quota_file(orig_inode) && ext4_is_quota_file(donor_inode)) { - ext4_debug("ext4 move extent: The argument files should " - "not be quota files [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); - return -EBUSY; + return -EOPNOTSUPP; } /* Ext4 move extent supports only extent based file */ @@ -631,11 +626,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, if (ret) goto out; ex = path[path->p_depth].p_ext; - next_blk = ext4_ext_next_allocated_block(path); cur_blk = le32_to_cpu(ex->ee_block); cur_len = ext4_ext_get_actual_len(ex); /* Check hole before the start pos */ if (cur_blk + cur_len - 1 < o_start) { + next_blk = ext4_ext_next_allocated_block(path); if (next_blk == EXT_MAX_BLOCKS) { ret = -ENODATA; goto out; @@ -663,7 +658,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, donor_page_index = d_start >> (PAGE_SHIFT - donor_inode->i_blkbits); offset_in_page = o_start % blocks_per_page; - if (cur_len > blocks_per_page- offset_in_page) + if (cur_len > blocks_per_page - offset_in_page) cur_len = blocks_per_page - offset_in_page; /* * Up semaphore to avoid following problems: @@ -694,8 +689,7 @@ out: ext4_discard_preallocations(donor_inode, 0); } - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); ext4_double_up_write_data_sem(orig_inode, donor_inode); unlock_two_nondirectories(orig_inode, donor_inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 3a31b662f661..d5daaf41e1fc 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -85,15 +85,20 @@ static struct buffer_head *ext4_append(handle_t *handle, return bh; inode->i_size += inode->i_sb->s_blocksize; EXT4_I(inode)->i_disksize = inode->i_size; + err = ext4_mark_inode_dirty(handle, inode); + if (err) + goto out; BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); - if (err) { - brelse(bh); - ext4_std_error(inode->i_sb, err); - return ERR_PTR(err); - } + if (err) + goto out; return bh; + +out: + brelse(bh); + ext4_std_error(inode->i_sb, err); + return ERR_PTR(err); } static int ext4_dx_csum_verify(struct inode 
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 701f1d6a217f..044e34cd835c 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -32,8 +32,7 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock,
 	if (IS_ERR(path))
 		return PTR_ERR(path);
 	if (path[ext_depth(inode)].p_ext == NULL) {
-		ext4_ext_drop_refs(path);
-		kfree(path);
+		ext4_free_ext_path(path);
 		*ppath = NULL;
 		return -ENODATA;
 	}
@@ -103,12 +102,10 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
 		if (unwritten != ext4_ext_is_unwritten(ext))
 			goto out;
 		from += ext4_ext_get_actual_len(ext);
-		ext4_ext_drop_refs(path);
 	}
 	ret = 1;
out:
-	ext4_ext_drop_refs(path);
-	kfree(path);
+	ext4_free_ext_path(path);
 	return ret;
 }
@@ -472,19 +469,17 @@ mext_check_arguments(struct inode *orig_inode,
 	if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
 		return -EPERM;
 
-	/* Ext4 move extent does not support swapfile */
+	/* Ext4 move extent does not support swap files */
 	if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
-		ext4_debug("ext4 move extent: The argument files should "
-			"not be swapfile [ino:orig %lu, donor %lu]\n",
+		ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n",
 			orig_inode->i_ino, donor_inode->i_ino);
-		return -EBUSY;
+		return -ETXTBSY;
 	}
 
 	if (ext4_is_quota_file(orig_inode) && ext4_is_quota_file(donor_inode)) {
-		ext4_debug("ext4 move extent: The argument files should "
-			"not be quota files [ino:orig %lu, donor %lu]\n",
+		ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n",
			orig_inode->i_ino, donor_inode->i_ino);
-		return -EBUSY;
+		return -EOPNOTSUPP;
 	}
 
 	/* Ext4 move extent supports only extent based file */
@@ -631,11 +626,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
 		if (ret)
 			goto out;
 		ex = path[path->p_depth].p_ext;
-		next_blk = ext4_ext_next_allocated_block(path);
 		cur_blk = le32_to_cpu(ex->ee_block);
 		cur_len = ext4_ext_get_actual_len(ex);
 		/* Check hole before the start pos */
 		if (cur_blk + cur_len - 1 < o_start) {
+			next_blk = ext4_ext_next_allocated_block(path);
 			if (next_blk == EXT_MAX_BLOCKS) {
 				ret = -ENODATA;
 				goto out;
@@ -663,7 +658,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
 		donor_page_index = d_start >> (PAGE_SHIFT -
 					       donor_inode->i_blkbits);
 		offset_in_page = o_start % blocks_per_page;
-		if (cur_len > blocks_per_page- offset_in_page)
+		if (cur_len > blocks_per_page - offset_in_page)
 			cur_len = blocks_per_page - offset_in_page;
 		/*
 		 * Up semaphore to avoid following problems:
@@ -694,8 +689,7 @@ out:
 		ext4_discard_preallocations(donor_inode, 0);
 	}
 
-	ext4_ext_drop_refs(path);
-	kfree(path);
+	ext4_free_ext_path(path);
 	ext4_double_up_write_data_sem(orig_inode, donor_inode);
 	unlock_two_nondirectories(orig_inode, donor_inode);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 3a31b662f661..d5daaf41e1fc 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -85,15 +85,20 @@ static struct buffer_head *ext4_append(handle_t *handle,
 		return bh;
 	inode->i_size += inode->i_sb->s_blocksize;
 	EXT4_I(inode)->i_disksize = inode->i_size;
+	err = ext4_mark_inode_dirty(handle, inode);
+	if (err)
+		goto out;
 	BUFFER_TRACE(bh, "get_write_access");
 	err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
 					    EXT4_JTR_NONE);
-	if (err) {
-		brelse(bh);
-		ext4_std_error(inode->i_sb, err);
-		return ERR_PTR(err);
-	}
+	if (err)
+		goto out;
 	return bh;
+
+out:
+	brelse(bh);
+	ext4_std_error(inode->i_sb, err);
+	return ERR_PTR(err);
 }
 
 static int ext4_dx_csum_verify(struct inode *inode,
@@ -126,7 +131,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 	struct ext4_dir_entry *dirent;
 	int is_dx_block = 0;
 
-	if (block >= inode->i_size) {
+	if (block >= inode->i_size >> inode->i_blkbits) {
 		ext4_error_inode(inode, func, line, block,
 		       "Attempting to read directory block (%u) that is past i_size (%llu)",
 		       block, inode->i_size);
@@ -2849,7 +2854,7 @@ retry:
 }
 
 static int ext4_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
-			struct dentry *dentry, umode_t mode)
+			struct file *file, umode_t mode)
 {
 	handle_t *handle;
 	struct inode *inode;
@@ -2871,7 +2876,7 @@ retry:
 		inode->i_op = &ext4_file_inode_operations;
 		inode->i_fop = &ext4_file_operations;
 		ext4_set_aops(inode);
-		d_tmpfile(dentry, inode);
+		d_tmpfile(file, inode);
 		err = ext4_orphan_add(handle, inode);
 		if (err)
 			goto err_unlock_inode;
@@ -2882,7 +2887,7 @@ retry:
 	ext4_journal_stop(handle);
 	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
 		goto retry;
-	return err;
+	return finish_open_simple(file, err);
 err_unlock_inode:
 	ext4_journal_stop(handle);
 	unlock_new_inode(inode);
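The ext4_append() hunk folds two error paths into a single out: label so that brelse() and ext4_std_error() appear exactly once. A generic userspace sketch of that kernel unwind idiom, using malloc() in place of the journal/buffer resources:

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical two-step setup with a single, centralized error exit. */
int setup(void)
{
	char *a = NULL, *b = NULL;
	int err = -1;

	a = malloc(16);
	if (!a)
		goto out;
	b = malloc(16);
	if (!b)
		goto out;
	/* ... use a and b ... */
	free(b);
	free(a);
	return 0;

out:
	/* Every failure releases whatever was taken, in one place. */
	free(b);
	free(a);
	fprintf(stderr, "setup failed\n");
	return err;
}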
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index e02a5f14e021..3d21eae267fc 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -75,7 +75,7 @@ static void __read_end_io(struct bio *bio)
 	bio_for_each_segment_all(bv, bio, iter_all) {
 		page = bv->bv_page;
 
-		/* PG_error was set if any post_read step failed */
+		/* PG_error was set if verity failed. */
 		if (bio->bi_status || PageError(page)) {
 			ClearPageUptodate(page);
 			/* will re-read again later */
@@ -96,10 +96,12 @@ static void decrypt_work(struct work_struct *work)
 {
 	struct bio_post_read_ctx *ctx =
 		container_of(work, struct bio_post_read_ctx, work);
+	struct bio *bio = ctx->bio;
 
-	fscrypt_decrypt_bio(ctx->bio);
-
-	bio_post_read_processing(ctx);
+	if (fscrypt_decrypt_bio(bio))
+		bio_post_read_processing(ctx);
+	else
+		__read_end_io(bio);
 }
 
 static void verity_work(struct work_struct *work)
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index fea2a68d067b..6dfe9ccae0c5 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -2122,7 +2122,7 @@ retry:
 		goto out;
 	}
 
-	if (ext4_blocks_count(es) == n_blocks_count)
+	if (ext4_blocks_count(es) == n_blocks_count && n_blocks_count_retry == 0)
 		goto out;
 
 	err = ext4_alloc_flex_bg_array(sb, n_group + 1);
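decrypt_work() now checks fscrypt_decrypt_bio()'s return value and terminates the post-read pipeline instead of handing undecrypted data to the verity stage. A minimal sketch of such a stop-on-failure stage chain (the stage functions are hypothetical, not the fscrypt/fsverity API):

#include <stdbool.h>
#include <stddef.h>

struct buf { unsigned char *data; size_t len; };

typedef bool (*stage_fn)(struct buf *);

bool decrypt_stage(struct buf *b);	/* hypothetical */
bool verify_stage(struct buf *b);	/* hypothetical */

/* Run each post-read stage in order; a failing stage ends the pipeline. */
bool post_read(struct buf *b)
{
	static const stage_fn stages[] = { decrypt_stage, verify_stage };
	size_t i;

	for (i = 0; i < sizeof(stages) / sizeof(stages[0]); i++)
		if (!stages[i](b))
			return false;	/* caller marks the page !uptodate */
	return true;
}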
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9a66abcca1a8..989365b878a6 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -205,19 +205,12 @@ int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io
 
 int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
 {
-	if (trylock_buffer(bh)) {
-		if (wait)
-			return ext4_read_bh(bh, op_flags, NULL);
+	lock_buffer(bh);
+	if (!wait) {
 		ext4_read_bh_nowait(bh, op_flags, NULL);
 		return 0;
 	}
-	if (wait) {
-		wait_on_buffer(bh);
-		if (buffer_uptodate(bh))
-			return 0;
-		return -EIO;
-	}
-	return 0;
+	return ext4_read_bh(bh, op_flags, NULL);
 }
 
 /*
@@ -264,7 +257,8 @@ void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
 	struct buffer_head *bh = sb_getblk_gfp(sb, block, 0);
 
 	if (likely(bh)) {
-		ext4_read_bh_lock(bh, REQ_RAHEAD, false);
+		if (trylock_buffer(bh))
+			ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL);
 		brelse(bh);
 	}
 }
@@ -1576,7 +1570,7 @@ enum {
 	Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
 	Opt_resgid, Opt_resuid, Opt_sb,
 	Opt_nouid32, Opt_debug, Opt_removed,
-	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
+	Opt_user_xattr, Opt_acl,
 	Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
 	Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
 	Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
@@ -1585,7 +1579,7 @@ enum {
 	Opt_inlinecrypt,
 	Opt_usrjquota, Opt_grpjquota, Opt_quota,
 	Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
-	Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version,
+	Opt_usrquota, Opt_grpquota, Opt_prjquota,
 	Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never,
 	Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error,
 	Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_debug_want_extra_isize,
@@ -1662,9 +1656,7 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
 	fsparam_flag	("oldalloc",		Opt_removed),
 	fsparam_flag	("orlov",		Opt_removed),
 	fsparam_flag	("user_xattr",		Opt_user_xattr),
-	fsparam_flag	("nouser_xattr",	Opt_nouser_xattr),
 	fsparam_flag	("acl",			Opt_acl),
-	fsparam_flag	("noacl",		Opt_noacl),
 	fsparam_flag	("norecovery",		Opt_noload),
 	fsparam_flag	("noload",		Opt_noload),
 	fsparam_flag	("bh",			Opt_removed),
@@ -1694,7 +1686,7 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
 	fsparam_flag	("barrier",		Opt_barrier),
 	fsparam_u32	("barrier",		Opt_barrier),
 	fsparam_flag	("nobarrier",		Opt_nobarrier),
-	fsparam_flag	("i_version",		Opt_i_version),
+	fsparam_flag	("i_version",		Opt_removed),
 	fsparam_flag	("dax",			Opt_dax),
 	fsparam_enum	("dax",			Opt_dax_type, ext4_param_dax),
 	fsparam_u32	("stripe",		Opt_stripe),
@@ -1814,13 +1806,10 @@ static const struct mount_opts {
 	{Opt_journal_ioprio, 0, MOPT_NO_EXT2},
 	{Opt_data, 0, MOPT_NO_EXT2},
 	{Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
-	{Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
 	{Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
-	{Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
 #else
 	{Opt_acl, 0, MOPT_NOSUPPORT},
-	{Opt_noacl, 0, MOPT_NOSUPPORT},
 #endif
 	{Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
 	{Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
@@ -2120,10 +2109,6 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
 		else
 			return note_qf_name(fc, GRPQUOTA, param);
 #endif
-	case Opt_noacl:
-	case Opt_nouser_xattr:
-		ext4_msg(NULL, KERN_WARNING, deprecated_msg, param->key, "3.5");
-		break;
 	case Opt_sb:
 		if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
 			ext4_msg(NULL, KERN_WARNING,
@@ -2140,11 +2125,6 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
 	case Opt_abort:
 		ctx_set_mount_flag(ctx, EXT4_MF_FS_ABORTED);
 		return 0;
-	case Opt_i_version:
-		ext4_msg(NULL, KERN_WARNING, deprecated_msg, param->key, "5.20");
-		ext4_msg(NULL, KERN_WARNING, "Use iversion instead\n");
-		ctx_set_flags(ctx, SB_I_VERSION);
-		return 0;
 	case Opt_inlinecrypt:
 #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
 		ctx_set_flags(ctx, SB_INLINECRYPT);
@@ -2814,14 +2794,6 @@ static void ext4_apply_options(struct fs_context *fc, struct super_block *sb)
 	sb->s_flags &= ~ctx->mask_s_flags;
 	sb->s_flags |= ctx->vals_s_flags;
 
-	/*
-	 * i_version differs from common mount option iversion so we have
-	 * to let vfs know that it was set, otherwise it would get cleared
-	 * on remount
-	 */
-	if (ctx->mask_s_flags & SB_I_VERSION)
-		fc->sb_flags |= SB_I_VERSION;
-
 #define APPLY(X) ({ if (ctx->spec & EXT4_SPEC_##X) sbi->X = ctx->X; })
 	APPLY(s_commit_interval);
 	APPLY(s_stripe);
@@ -2970,8 +2942,6 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
 		SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
 	if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
 		SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
-	if (sb->s_flags & SB_I_VERSION)
-		SEQ_OPTS_PUTS("i_version");
 	if (nodefs || sbi->s_stripe)
 		SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
 	if (nodefs || EXT4_MOUNT_DATA_FLAGS &
@@ -3767,6 +3737,7 @@ static int ext4_lazyinit_thread(void *arg)
 	unsigned long next_wakeup, cur;
 
 	BUG_ON(NULL == eli);
+	set_freezable();
 
 cont_thread:
 	while (true) {
@@ -3811,8 +3782,7 @@ cont_thread:
 			}
 			if (!progress) {
 				elr->lr_next_sched = jiffies +
-					(prandom_u32()
-					 % (EXT4_DEF_LI_MAX_START_DELAY * HZ));
+					prandom_u32_max(EXT4_DEF_LI_MAX_START_DELAY * HZ);
 			}
 			if (time_before(elr->lr_next_sched, next_wakeup))
 				next_wakeup = elr->lr_next_sched;
@@ -3959,8 +3929,8 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
 	 * spread the inode table initialization requests
 	 * better.
 	 */
-	elr->lr_next_sched = jiffies + (prandom_u32() %
-				(EXT4_DEF_LI_MAX_START_DELAY * HZ));
+	elr->lr_next_sched = jiffies + prandom_u32_max(
+				EXT4_DEF_LI_MAX_START_DELAY * HZ);
 	return elr;
 }
@@ -3982,9 +3952,9 @@ int ext4_register_li_request(struct super_block *sb,
 		goto out;
 	}
 
-	if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) &&
-	    (first_not_zeroed == ngroups || sb_rdonly(sb) ||
-	     !test_opt(sb, INIT_INODE_TABLE)))
+	if (sb_rdonly(sb) ||
+	    (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) &&
+	     (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE))))
 		goto out;
 
 	elr = ext4_li_request_new(sb, first_not_zeroed);
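The lazyinit hunks above replace open-coded `prandom_u32() % bound` with prandom_u32_max(), which maps a 32-bit draw onto [0, bound) using a multiply-and-shift instead of a division. A userspace sketch of that range-reduction trick (rng32() is again a hypothetical generator):

#include <stdint.h>

uint32_t rng32(void);	/* hypothetical 32-bit uniform generator */

/*
 * Multiply-shift range reduction: (x * bound) >> 32 lands in [0, bound)
 * and avoids a hardware divide; the residual bias is negligible for
 * things like randomized start delays.
 */
uint32_t rand_below(uint32_t bound)
{
	return (uint32_t)(((uint64_t)rng32() * bound) >> 32);
}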
@@ -4311,112 +4281,10 @@ err_out:
 	return NULL;
 }
 
-static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
+static void ext4_set_def_opts(struct super_block *sb,
+			      struct ext4_super_block *es)
 {
-	struct buffer_head *bh, **group_desc;
-	struct ext4_super_block *es = NULL;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct flex_groups **flex_groups;
-	ext4_fsblk_t block;
-	ext4_fsblk_t logical_sb_block;
-	unsigned long offset = 0;
 	unsigned long def_mount_opts;
-	struct inode *root;
-	int ret = -ENOMEM;
-	int blocksize, clustersize;
-	unsigned int db_count;
-	unsigned int i;
-	int needs_recovery, has_huge_files;
-	__u64 blocks_count;
-	int err = 0;
-	ext4_group_t first_not_zeroed;
-	struct ext4_fs_context *ctx = fc->fs_private;
-	int silent = fc->sb_flags & SB_SILENT;
-
-	/* Set defaults for the variables that will be set during parsing */
-	if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO))
-		ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
-
-	sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
-	sbi->s_sectors_written_start =
-		part_stat_read(sb->s_bdev, sectors[STAT_WRITE]);
-
-	/* -EINVAL is default */
-	ret = -EINVAL;
-	blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
-	if (!blocksize) {
-		ext4_msg(sb, KERN_ERR, "unable to set blocksize");
-		goto out_fail;
-	}
-
-	/*
-	 * The ext4 superblock will not be buffer aligned for other than 1kB
-	 * block sizes.  We need to calculate the offset from buffer start.
-	 */
-	if (blocksize != EXT4_MIN_BLOCK_SIZE) {
-		logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
-		offset = do_div(logical_sb_block, blocksize);
-	} else {
-		logical_sb_block = sbi->s_sb_block;
-	}
-
-	bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
-	if (IS_ERR(bh)) {
-		ext4_msg(sb, KERN_ERR, "unable to read superblock");
-		ret = PTR_ERR(bh);
-		goto out_fail;
-	}
-	/*
-	 * Note: s_es must be initialized as soon as possible because
-	 *       some ext4 macro-instructions depend on its value
-	 */
-	es = (struct ext4_super_block *) (bh->b_data + offset);
-	sbi->s_es = es;
-	sb->s_magic = le16_to_cpu(es->s_magic);
-	if (sb->s_magic != EXT4_SUPER_MAGIC)
-		goto cantfind_ext4;
-	sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
-
-	/* Warn if metadata_csum and gdt_csum are both set. */
-	if (ext4_has_feature_metadata_csum(sb) &&
-	    ext4_has_feature_gdt_csum(sb))
-		ext4_warning(sb, "metadata_csum and uninit_bg are "
-			     "redundant flags; please run fsck.");
-
-	/* Check for a known checksum algorithm */
-	if (!ext4_verify_csum_type(sb, es)) {
-		ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
-			 "unknown checksum algorithm.");
-		silent = 1;
-		goto cantfind_ext4;
-	}
-	ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE,
-				ext4_orphan_file_block_trigger);
-
-	/* Load the checksum driver */
-	sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
-	if (IS_ERR(sbi->s_chksum_driver)) {
-		ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
-		ret = PTR_ERR(sbi->s_chksum_driver);
-		sbi->s_chksum_driver = NULL;
-		goto failed_mount;
-	}
-
-	/* Check superblock checksum */
-	if (!ext4_superblock_csum_verify(sb, es)) {
-		ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
-			 "invalid superblock checksum.  Run e2fsck?");
-		silent = 1;
-		ret = -EFSBADCRC;
-		goto cantfind_ext4;
-	}
-
-	/* Precompute checksum seed for all metadata */
-	if (ext4_has_feature_csum_seed(sb))
-		sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
-	else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
-		sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
-					       sizeof(es->s_uuid));
 
 	/* Set defaults before we parse the mount options */
 	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -4445,9 +4313,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
 		set_opt(sb, WRITEBACK_DATA);
 
-	if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
+	if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_PANIC)
 		set_opt(sb, ERRORS_PANIC);
-	else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
+	else if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_CONTINUE)
 		set_opt(sb, ERRORS_CONT);
 	else
 		set_opt(sb, ERRORS_RO);
@@ -4456,12 +4324,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 	if (def_mount_opts & EXT4_DEFM_DISCARD)
 		set_opt(sb, DISCARD);
 
-	sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
-	sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
-	sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
-	sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
-	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
-
 	if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
 		set_opt(sb, BARRIER);
 
@@ -4473,31 +4335,96 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 	    ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
 		set_opt(sb, DELALLOC);
 
-	/*
-	 * set default s_li_wait_mult for lazyinit, for the case there is
-	 * no mount option specified.
-	 */
-	sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+	if (sb->s_blocksize == PAGE_SIZE)
+		set_opt(sb, DIOREAD_NOLOCK);
+}
 
-	if (le32_to_cpu(es->s_log_block_size) >
-	    (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
-		ext4_msg(sb, KERN_ERR,
-			 "Invalid log block size: %u",
-			 le32_to_cpu(es->s_log_block_size));
-		goto failed_mount;
-	}
-	if (le32_to_cpu(es->s_log_cluster_size) >
-	    (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
-		ext4_msg(sb, KERN_ERR,
-			 "Invalid log cluster size: %u",
-			 le32_to_cpu(es->s_log_cluster_size));
-		goto failed_mount;
+static int ext4_handle_clustersize(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	int clustersize;
+
+	/* Handle clustersize */
+	clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
+	if (ext4_has_feature_bigalloc(sb)) {
+		if (clustersize < sb->s_blocksize) {
+			ext4_msg(sb, KERN_ERR,
+				 "cluster size (%d) smaller than "
+				 "block size (%lu)", clustersize, sb->s_blocksize);
+			return -EINVAL;
+		}
+		sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
+			le32_to_cpu(es->s_log_block_size);
+		sbi->s_clusters_per_group =
+			le32_to_cpu(es->s_clusters_per_group);
+		if (sbi->s_clusters_per_group > sb->s_blocksize * 8) {
+			ext4_msg(sb, KERN_ERR,
+				 "#clusters per group too big: %lu",
+				 sbi->s_clusters_per_group);
+			return -EINVAL;
+		}
+		if (sbi->s_blocks_per_group !=
+		    (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) {
+			ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
+				 "clusters per group (%lu) inconsistent",
+				 sbi->s_blocks_per_group,
+				 sbi->s_clusters_per_group);
+			return -EINVAL;
+		}
+	} else {
+		if (clustersize != sb->s_blocksize) {
+			ext4_msg(sb, KERN_ERR,
+				 "fragment/cluster size (%d) != "
+				 "block size (%lu)", clustersize, sb->s_blocksize);
+			return -EINVAL;
+		}
+		if (sbi->s_blocks_per_group > sb->s_blocksize * 8) {
+			ext4_msg(sb, KERN_ERR,
+				 "#blocks per group too big: %lu",
+				 sbi->s_blocks_per_group);
+			return -EINVAL;
+		}
+		sbi->s_clusters_per_group = sbi->s_blocks_per_group;
+		sbi->s_cluster_bits = 0;
 	}
+	sbi->s_cluster_ratio = clustersize / sb->s_blocksize;
 
-	blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+	/* Do we have standard group size of clustersize * 8 blocks ? */
+	if (sbi->s_blocks_per_group == clustersize << 3)
+		set_opt2(sb, STD_GROUP_SIZE);
 
-	if (blocksize == PAGE_SIZE)
-		set_opt(sb, DIOREAD_NOLOCK);
+	return 0;
+}
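ext4_handle_clustersize() derives the cluster geometry from two on-disk log fields. A standalone sketch of the same arithmetic for a hypothetical 4 KiB-block, 64 KiB-cluster bigalloc layout:

#include <stdio.h>

int main(void)
{
	unsigned log_block_size = 2;	/* block = 1024 << 2 = 4096 bytes */
	unsigned log_cluster_size = 6;	/* cluster = 1024 << 6 = 65536 bytes */
	unsigned blocksize = 1024u << log_block_size;
	unsigned clustersize = 1024u << log_cluster_size;

	/* Same derivations as the hunk above. */
	unsigned cluster_bits = log_cluster_size - log_block_size;	/* 4 */
	unsigned cluster_ratio = clustersize / blocksize;		/* 16 blocks/cluster */

	printf("cluster_bits=%u ratio=%u\n", cluster_bits, cluster_ratio);
	return 0;
}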
+
+static void ext4_fast_commit_init(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	/* Initialize fast commit stuff */
+	atomic_set(&sbi->s_fc_subtid, 0);
+	INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]);
+	INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]);
+	INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]);
+	INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);
+	sbi->s_fc_bytes = 0;
+	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+	sbi->s_fc_ineligible_tid = 0;
+	spin_lock_init(&sbi->s_fc_lock);
+	memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
+	sbi->s_fc_replay_state.fc_regions = NULL;
+	sbi->s_fc_replay_state.fc_regions_size = 0;
+	sbi->s_fc_replay_state.fc_regions_used = 0;
+	sbi->s_fc_replay_state.fc_regions_valid = 0;
+	sbi->s_fc_replay_state.fc_modified_inodes = NULL;
+	sbi->s_fc_replay_state.fc_modified_inodes_size = 0;
+	sbi->s_fc_replay_state.fc_modified_inodes_used = 0;
+}
+
+static int ext4_inode_info_init(struct super_block *sb,
+				struct ext4_super_block *es)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
 		sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
@@ -4508,16 +4435,16 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 		if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) {
 			ext4_msg(sb, KERN_ERR, "invalid first ino: %u",
 				 sbi->s_first_ino);
-			goto failed_mount;
+			return -EINVAL;
 		}
 		if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
 		    (!is_power_of_2(sbi->s_inode_size)) ||
-		    (sbi->s_inode_size > blocksize)) {
+		    (sbi->s_inode_size > sb->s_blocksize)) {
 			ext4_msg(sb, KERN_ERR,
 			       "unsupported inode size: %d",
 			       sbi->s_inode_size);
-			ext4_msg(sb, KERN_ERR, "blocksize: %d", blocksize);
-			goto failed_mount;
+			ext4_msg(sb, KERN_ERR, "blocksize: %lu", sb->s_blocksize);
+			return -EINVAL;
 		}
 		/*
 		 * i_atime_extra is the last extra field available for
@@ -4535,6 +4462,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 		}
 		sb->s_time_min = EXT4_TIMESTAMP_MIN;
 	}
+
 	if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
 		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
 			EXT4_GOOD_OLD_INODE_SIZE;
@@ -4546,7 +4474,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 			if (v > max) {
 				ext4_msg(sb, KERN_ERR,
 					 "bad s_want_extra_isize: %d", v);
-				goto failed_mount;
+				return -EINVAL;
 			}
 			if (sbi->s_want_extra_isize < v)
 				sbi->s_want_extra_isize = v;
@@ -4555,91 +4483,112 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 			if (v > max) {
 				ext4_msg(sb, KERN_ERR,
 					 "bad s_min_extra_isize: %d", v);
-				goto failed_mount;
+				return -EINVAL;
 			}
 			if (sbi->s_want_extra_isize < v)
 				sbi->s_want_extra_isize = v;
 		}
 	}
 
-	err = parse_apply_sb_mount_options(sb, ctx);
-	if (err < 0)
-		goto failed_mount;
+	return 0;
+}
 
-	sbi->s_def_mount_opt = sbi->s_mount_opt;
+#if IS_ENABLED(CONFIG_UNICODE)
+static int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es)
+{
+	const struct ext4_sb_encodings *encoding_info;
+	struct unicode_map *encoding;
+	__u16 encoding_flags = le16_to_cpu(es->s_encoding_flags);
 
-	err = ext4_check_opt_consistency(fc, sb);
-	if (err < 0)
-		goto failed_mount;
+	if (!ext4_has_feature_casefold(sb) || sb->s_encoding)
+		return 0;
 
-	ext4_apply_options(fc, sb);
+	encoding_info = ext4_sb_read_encoding(es);
+	if (!encoding_info) {
+		ext4_msg(sb, KERN_ERR,
+			 "Encoding requested by superblock is unknown");
+		return -EINVAL;
+	}
 
-#if IS_ENABLED(CONFIG_UNICODE)
-	if (ext4_has_feature_casefold(sb) && !sb->s_encoding) {
-		const struct ext4_sb_encodings *encoding_info;
-		struct unicode_map *encoding;
-		__u16 encoding_flags = le16_to_cpu(es->s_encoding_flags);
+	encoding = utf8_load(encoding_info->version);
+	if (IS_ERR(encoding)) {
+		ext4_msg(sb, KERN_ERR,
+			 "can't mount with superblock charset: %s-%u.%u.%u "
+			 "not supported by the kernel. flags: 0x%x.",
+			 encoding_info->name,
+			 unicode_major(encoding_info->version),
+			 unicode_minor(encoding_info->version),
+			 unicode_rev(encoding_info->version),
+			 encoding_flags);
+		return -EINVAL;
+	}
+	ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: "
+		 "%s-%u.%u.%u with flags 0x%hx", encoding_info->name,
+		 unicode_major(encoding_info->version),
+		 unicode_minor(encoding_info->version),
+		 unicode_rev(encoding_info->version),
+		 encoding_flags);
 
-		encoding_info = ext4_sb_read_encoding(es);
-		if (!encoding_info) {
-			ext4_msg(sb, KERN_ERR,
-				 "Encoding requested by superblock is unknown");
-			goto failed_mount;
-		}
+	sb->s_encoding = encoding;
+	sb->s_encoding_flags = encoding_flags;
 
-		encoding = utf8_load(encoding_info->version);
-		if (IS_ERR(encoding)) {
-			ext4_msg(sb, KERN_ERR,
-				 "can't mount with superblock charset: %s-%u.%u.%u "
-				 "not supported by the kernel. flags: 0x%x.",
-				 encoding_info->name,
-				 unicode_major(encoding_info->version),
-				 unicode_minor(encoding_info->version),
-				 unicode_rev(encoding_info->version),
-				 encoding_flags);
-			goto failed_mount;
-		}
-		ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: "
-			 "%s-%u.%u.%u with flags 0x%hx", encoding_info->name,
-			 unicode_major(encoding_info->version),
-			 unicode_minor(encoding_info->version),
-			 unicode_rev(encoding_info->version),
-			 encoding_flags);
+	return 0;
+}
+#else
+static inline int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es)
+{
+	return 0;
+}
+#endif
 
-		sb->s_encoding = encoding;
-		sb->s_encoding_flags = encoding_flags;
+static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_block *es)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	/* Warn if metadata_csum and gdt_csum are both set. */
+	if (ext4_has_feature_metadata_csum(sb) &&
+	    ext4_has_feature_gdt_csum(sb))
+		ext4_warning(sb, "metadata_csum and uninit_bg are "
+			     "redundant flags; please run fsck.");
+
+	/* Check for a known checksum algorithm */
+	if (!ext4_verify_csum_type(sb, es)) {
+		ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
			 "unknown checksum algorithm.");
+		return -EINVAL;
 	}
-#endif
+	ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE,
+				ext4_orphan_file_block_trigger);
 
-	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
-		printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, O_DIRECT and fast_commit support!\n");
-		/* can't mount with both data=journal and dioread_nolock. */
-		clear_opt(sb, DIOREAD_NOLOCK);
-		clear_opt2(sb, JOURNAL_FAST_COMMIT);
-		if (test_opt2(sb, EXPLICIT_DELALLOC)) {
-			ext4_msg(sb, KERN_ERR, "can't mount with "
-				 "both data=journal and delalloc");
-			goto failed_mount;
-		}
-		if (test_opt(sb, DAX_ALWAYS)) {
-			ext4_msg(sb, KERN_ERR, "can't mount with "
-				 "both data=journal and dax");
-			goto failed_mount;
-		}
-		if (ext4_has_feature_encrypt(sb)) {
-			ext4_msg(sb, KERN_WARNING,
-				 "encrypted files will use data=ordered "
-				 "instead of data journaling mode");
-		}
-		if (test_opt(sb, DELALLOC))
-			clear_opt(sb, DELALLOC);
-	} else {
-		sb->s_iflags |= SB_I_CGROUPWB;
+	/* Load the checksum driver */
+	sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
+	if (IS_ERR(sbi->s_chksum_driver)) {
+		int ret = PTR_ERR(sbi->s_chksum_driver);
+
+		ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
+		sbi->s_chksum_driver = NULL;
+		return ret;
 	}
 
-	sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
-		(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
+	/* Check superblock checksum */
+	if (!ext4_superblock_csum_verify(sb, es)) {
+		ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
+			 "invalid superblock checksum.  Run e2fsck?");
+		return -EFSBADCRC;
+	}
+
+	/* Precompute checksum seed for all metadata */
+	if (ext4_has_feature_csum_seed(sb))
+		sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
+	else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
+		sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
+					       sizeof(es->s_uuid));
+	return 0;
+}
+
+static int ext4_check_feature_compatibility(struct super_block *sb,
+					    struct ext4_super_block *es,
+					    int silent)
+{
 	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
 	    (ext4_has_compat_features(sb) ||
 	     ext4_has_ro_compat_features(sb) ||
@@ -4653,7 +4602,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 	if (ext4_has_feature_64bit(sb)) {
 		ext4_msg(sb, KERN_ERR,
			 "The Hurd can't support 64-bit file systems");
-		goto failed_mount;
+		return -EINVAL;
 	}
 
 	/*
@@ -4663,7 +4612,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 	if (ext4_has_feature_ea_inode(sb)) {
 		ext4_msg(sb, KERN_ERR,
			 "ea_inode feature is not supported for Hurd");
-		goto failed_mount;
+		return -EINVAL;
 	}
 }
@@ -4677,10 +4626,10 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 		 * it's actually an ext[34] filesystem.
 		 */
 		if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
-			goto failed_mount;
+			return -EINVAL;
 		ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
			 "to feature incompatibilities");
-		goto failed_mount;
+		return -EINVAL;
 	}
 }
@@ -4694,10 +4643,10 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 		 * it's actually an ext4 filesystem.
 		 */
 		if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
-			goto failed_mount;
+			return -EINVAL;
 		ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
			 "to feature incompatibilities");
-		goto failed_mount;
+		return -EINVAL;
 	}
 }
@@ -4707,9 +4656,463 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 	 * so there is a chance incompat flags are set on a rev 0 filesystem.
 	 */
 	if (!ext4_feature_set_ok(sb, (sb_rdonly(sb))))
+		return -EINVAL;
+
+	return 0;
+}
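ext4_check_feature_compatibility() ultimately reduces to comparing on-disk feature bitmaps against the set the driver understands. A toy sketch of that incompat-mask test (the flag values here are made up for illustration, not ext4's on-disk constants):

#include <stdbool.h>
#include <stdint.h>

/* Made-up feature bits, illustration only. */
#define FEAT_INCOMPAT_EXTENTS	0x01u
#define FEAT_INCOMPAT_64BIT	0x02u
#define SUPPORTED_INCOMPAT	(FEAT_INCOMPAT_EXTENTS | FEAT_INCOMPAT_64BIT)

/*
 * An incompat feature the driver does not understand makes the volume
 * unsafe to touch at all, so any unknown bit means "refuse to mount".
 */
bool incompat_features_ok(uint32_t on_disk_incompat)
{
	return (on_disk_incompat & ~SUPPORTED_INCOMPAT) == 0;
}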
+
+static int ext4_geometry_check(struct super_block *sb,
+			       struct ext4_super_block *es)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	__u64 blocks_count;
+
+	/* check blocks count against device size */
+	blocks_count = sb_bdev_nr_blocks(sb);
+	if (blocks_count && ext4_blocks_count(es) > blocks_count) {
+		ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
+			 "exceeds size of device (%llu blocks)",
+			 ext4_blocks_count(es), blocks_count);
+		return -EINVAL;
+	}
+
+	/*
+	 * It makes no sense for the first data block to be beyond the end
+	 * of the filesystem.
+	 */
+	if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
+		ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
+			 "block %u is beyond end of filesystem (%llu)",
+			 le32_to_cpu(es->s_first_data_block),
+			 ext4_blocks_count(es));
+		return -EINVAL;
+	}
+	if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) &&
+	    (sbi->s_cluster_ratio == 1)) {
+		ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
+			 "block is 0 with a 1k block and cluster size");
+		return -EINVAL;
+	}
+
+	blocks_count = (ext4_blocks_count(es) -
+			le32_to_cpu(es->s_first_data_block) +
+			EXT4_BLOCKS_PER_GROUP(sb) - 1);
+	do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
+	if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
+		ext4_msg(sb, KERN_WARNING, "groups count too large: %llu "
+			 "(block count %llu, first data block %u, "
+			 "blocks per group %lu)", blocks_count,
+			 ext4_blocks_count(es),
+			 le32_to_cpu(es->s_first_data_block),
+			 EXT4_BLOCKS_PER_GROUP(sb));
+		return -EINVAL;
+	}
+	sbi->s_groups_count = blocks_count;
+	sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
+			(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
+	if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) !=
+	    le32_to_cpu(es->s_inodes_count)) {
+		ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu",
+			 le32_to_cpu(es->s_inodes_count),
+			 ((u64)sbi->s_groups_count * sbi->s_inodes_per_group));
+		return -EINVAL;
+	}
+
+	return 0;
+}
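The group count computed in ext4_geometry_check() is a ceiling division of the usable block range by the blocks-per-group. The same arithmetic on example numbers:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t blocks_count = 2621440;	/* 10 GiB of 4 KiB blocks */
	uint32_t first_data_block = 0;
	uint32_t blocks_per_group = 32768;	/* 8 * 4096, the standard size */

	/* Round up: a final partial group still needs a descriptor. */
	uint64_t groups = (blocks_count - first_data_block +
			   blocks_per_group - 1) / blocks_per_group;

	printf("groups = %llu\n", (unsigned long long)groups);	/* 80 */
	return 0;
}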
+
+static void ext4_group_desc_free(struct ext4_sb_info *sbi)
+{
+	struct buffer_head **group_desc;
+	int i;
+
+	rcu_read_lock();
+	group_desc = rcu_dereference(sbi->s_group_desc);
+	for (i = 0; i < sbi->s_gdb_count; i++)
+		brelse(group_desc[i]);
+	kvfree(group_desc);
+	rcu_read_unlock();
+}
+
+static int ext4_group_desc_init(struct super_block *sb,
+				struct ext4_super_block *es,
+				ext4_fsblk_t logical_sb_block,
+				ext4_group_t *first_not_zeroed)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	unsigned int db_count;
+	ext4_fsblk_t block;
+	int ret;
+	int i;
+
+	db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
+		   EXT4_DESC_PER_BLOCK(sb);
+	if (ext4_has_feature_meta_bg(sb)) {
+		if (le32_to_cpu(es->s_first_meta_bg) > db_count) {
+			ext4_msg(sb, KERN_WARNING,
+				 "first meta block group too large: %u "
+				 "(group descriptor block count %u)",
+				 le32_to_cpu(es->s_first_meta_bg), db_count);
+			return -EINVAL;
+		}
+	}
+	rcu_assign_pointer(sbi->s_group_desc,
+			   kvmalloc_array(db_count,
+					  sizeof(struct buffer_head *),
+					  GFP_KERNEL));
+	if (sbi->s_group_desc == NULL) {
+		ext4_msg(sb, KERN_ERR, "not enough memory");
+		return -ENOMEM;
+	}
+
+	bgl_lock_init(sbi->s_blockgroup_lock);
+
+	/* Pre-read the descriptors into the buffer cache */
+	for (i = 0; i < db_count; i++) {
+		block = descriptor_loc(sb, logical_sb_block, i);
+		ext4_sb_breadahead_unmovable(sb, block);
+	}
+
+	for (i = 0; i < db_count; i++) {
+		struct buffer_head *bh;
+
+		block = descriptor_loc(sb, logical_sb_block, i);
+		bh = ext4_sb_bread_unmovable(sb, block);
+		if (IS_ERR(bh)) {
+			ext4_msg(sb, KERN_ERR,
+				 "can't read group descriptor %d", i);
+			sbi->s_gdb_count = i;
+			ret = PTR_ERR(bh);
+			goto out;
+		}
+		rcu_read_lock();
+		rcu_dereference(sbi->s_group_desc)[i] = bh;
+		rcu_read_unlock();
+	}
+	sbi->s_gdb_count = db_count;
+	if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) {
+		ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
+		ret = -EFSCORRUPTED;
+		goto out;
+	}
+	return 0;
+out:
+	ext4_group_desc_free(sbi);
+	return ret;
+}
+
+static int ext4_load_and_init_journal(struct super_block *sb,
+				      struct ext4_super_block *es,
+				      struct ext4_fs_context *ctx)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	int err;
+
+	err = ext4_load_journal(sb, es, ctx->journal_devnum);
+	if (err)
+		return err;
+
+	if (ext4_has_feature_64bit(sb) &&
+	    !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
+				       JBD2_FEATURE_INCOMPAT_64BIT)) {
+		ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
+		goto out;
+	}
+
+	if (!set_journal_csum_feature_set(sb)) {
+		ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
+			 "feature set");
+		goto out;
+	}
+
+	if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
+	    !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
+				       JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) {
+		ext4_msg(sb, KERN_ERR,
+			 "Failed to set fast commit journal feature");
+		goto out;
+	}
+
+	/* We have now updated the journal if required, so we can
+	 * validate the data journaling mode. */
+	switch (test_opt(sb, DATA_FLAGS)) {
+	case 0:
+		/* No mode set, assume a default based on the journal
+		 * capabilities: ORDERED_DATA if the journal can
+		 * cope, else JOURNAL_DATA
+		 */
+		if (jbd2_journal_check_available_features
+		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
+			set_opt(sb, ORDERED_DATA);
+			sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
+		} else {
+			set_opt(sb, JOURNAL_DATA);
+			sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
+		}
+		break;
+
+	case EXT4_MOUNT_ORDERED_DATA:
+	case EXT4_MOUNT_WRITEBACK_DATA:
+		if (!jbd2_journal_check_available_features
+		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
+			ext4_msg(sb, KERN_ERR, "Journal does not support "
+				 "requested data journaling mode");
+			goto out;
+		}
+		break;
+	default:
+		break;
+	}
+
+	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
+	    test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+		ext4_msg(sb, KERN_ERR, "can't mount with "
+			 "journal_async_commit in data=ordered mode");
+		goto out;
+	}
+
+	set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio);
+
+	sbi->s_journal->j_submit_inode_data_buffers =
+		ext4_journal_submit_inode_data_buffers;
+	sbi->s_journal->j_finish_inode_data_buffers =
+		ext4_journal_finish_inode_data_buffers;
+
+	return 0;
+
+out:
+	/* flush s_error_work before journal destroy. */
+	flush_work(&sbi->s_error_work);
+	jbd2_journal_destroy(sbi->s_journal);
+	sbi->s_journal = NULL;
+	return err;
+}
+
+static int ext4_journal_data_mode_check(struct super_block *sb)
+{
+	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+		printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with "
+			    "data=journal disables delayed allocation, "
+			    "dioread_nolock, O_DIRECT and fast_commit support!\n");
+		/* can't mount with both data=journal and dioread_nolock. */
+		clear_opt(sb, DIOREAD_NOLOCK);
+		clear_opt2(sb, JOURNAL_FAST_COMMIT);
+		if (test_opt2(sb, EXPLICIT_DELALLOC)) {
+			ext4_msg(sb, KERN_ERR, "can't mount with "
+				 "both data=journal and delalloc");
+			return -EINVAL;
+		}
+		if (test_opt(sb, DAX_ALWAYS)) {
+			ext4_msg(sb, KERN_ERR, "can't mount with "
+				 "both data=journal and dax");
+			return -EINVAL;
+		}
+		if (ext4_has_feature_encrypt(sb)) {
+			ext4_msg(sb, KERN_WARNING,
+				 "encrypted files will use data=ordered "
+				 "instead of data journaling mode");
+		}
+		if (test_opt(sb, DELALLOC))
+			clear_opt(sb, DELALLOC);
+	} else {
+		sb->s_iflags |= SB_I_CGROUPWB;
+	}
+
+	return 0;
+}
+
+static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb,
+			   int silent)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es;
+	ext4_fsblk_t logical_sb_block;
+	unsigned long offset = 0;
+	struct buffer_head *bh;
+	int ret = -EINVAL;
+	int blocksize;
+
+	blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
+	if (!blocksize) {
+		ext4_msg(sb, KERN_ERR, "unable to set blocksize");
+		return -EINVAL;
+	}
+
+	/*
+	 * The ext4 superblock will not be buffer aligned for other than 1kB
+	 * block sizes.  We need to calculate the offset from buffer start.
+	 */
+	if (blocksize != EXT4_MIN_BLOCK_SIZE) {
+		logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
+		offset = do_div(logical_sb_block, blocksize);
+	} else {
+		logical_sb_block = sbi->s_sb_block;
+	}
+
+	bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
+	if (IS_ERR(bh)) {
+		ext4_msg(sb, KERN_ERR, "unable to read superblock");
+		return PTR_ERR(bh);
+	}
+	/*
+	 * Note: s_es must be initialized as soon as possible because
+	 *       some ext4 macro-instructions depend on its value
+	 */
+	es = (struct ext4_super_block *) (bh->b_data + offset);
+	sbi->s_es = es;
+	sb->s_magic = le16_to_cpu(es->s_magic);
+	if (sb->s_magic != EXT4_SUPER_MAGIC) {
+		if (!silent)
+			ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
+		goto out;
+	}
+
+	if (le32_to_cpu(es->s_log_block_size) >
+	    (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
+		ext4_msg(sb, KERN_ERR,
+			 "Invalid log block size: %u",
+			 le32_to_cpu(es->s_log_block_size));
+		goto out;
+	}
+	if (le32_to_cpu(es->s_log_cluster_size) >
+	    (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
+		ext4_msg(sb, KERN_ERR,
+			 "Invalid log cluster size: %u",
+			 le32_to_cpu(es->s_log_cluster_size));
+		goto out;
+	}
+
+	blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+
+	/*
+	 * If the default block size is not the same as the real block size,
+	 * we need to reload it.
+	 */
+	if (sb->s_blocksize == blocksize) {
+		*lsb = logical_sb_block;
+		sbi->s_sbh = bh;
+		return 0;
+	}
+
+	/*
+	 * bh must be released before kill_bdev(), otherwise
+	 * it won't be freed and its page also. kill_bdev()
+	 * is called by sb_set_blocksize().
+	 */
+	brelse(bh);
+	/* Validate the filesystem blocksize */
+	if (!sb_set_blocksize(sb, blocksize)) {
+		ext4_msg(sb, KERN_ERR, "bad block size %d", blocksize);
+		bh = NULL;
+		goto out;
+	}
+
+	logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
+	offset = do_div(logical_sb_block, blocksize);
+	bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
+	if (IS_ERR(bh)) {
+		ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try");
+		ret = PTR_ERR(bh);
+		bh = NULL;
+		goto out;
+	}
+	es = (struct ext4_super_block *)(bh->b_data + offset);
+	sbi->s_es = es;
+	if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
+		ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!");
+		goto out;
+	}
+	*lsb = logical_sb_block;
+	sbi->s_sbh = bh;
+	return 0;
+out:
+	brelse(bh);
+	return ret;
+}
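ext4_load_super() still has to locate the superblock, which lives at byte offset 1024 regardless of block size, so for anything but 1 KiB blocks it computes a containing block plus an intra-block offset. The same calculation for the default sb_block = 1:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t sb_block = 1;		/* superblock location in 1 KiB units */
	unsigned blocksize = 4096;	/* device block size after sb_min_blocksize() */

	uint64_t byte_pos = sb_block * 1024;		/* 1024 */
	uint64_t logical_block = byte_pos / blocksize;	/* 0 */
	unsigned offset = byte_pos % blocksize;		/* 1024 */

	printf("read block %llu, superblock at offset %u\n",
	       (unsigned long long)logical_block, offset);
	return 0;
}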
+
+static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
+{
+	struct ext4_super_block *es = NULL;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct flex_groups **flex_groups;
+	ext4_fsblk_t block;
+	ext4_fsblk_t logical_sb_block;
+	struct inode *root;
+	int ret = -ENOMEM;
+	unsigned int i;
+	int needs_recovery, has_huge_files;
+	int err = 0;
+	ext4_group_t first_not_zeroed;
+	struct ext4_fs_context *ctx = fc->fs_private;
+	int silent = fc->sb_flags & SB_SILENT;
+
+	/* Set defaults for the variables that will be set during parsing */
+	if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO))
+		ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+
+	sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
+	sbi->s_sectors_written_start =
+		part_stat_read(sb->s_bdev, sectors[STAT_WRITE]);
+
+	/* -EINVAL is default */
+	ret = -EINVAL;
+	err = ext4_load_super(sb, &logical_sb_block, silent);
+	if (err)
+		goto out_fail;
+
+	es = sbi->s_es;
+	sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
+
+	err = ext4_init_metadata_csum(sb, es);
+	if (err)
+		goto failed_mount;
+
+	ext4_set_def_opts(sb, es);
+
+	sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
+	sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
+	sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
+	sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
+	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
+
+	/*
+	 * set default s_li_wait_mult for lazyinit, for the case there is
+	 * no mount option specified.
+	 */
+	sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+
+	if (ext4_inode_info_init(sb, es))
+		goto failed_mount;
+
+	err = parse_apply_sb_mount_options(sb, ctx);
+	if (err < 0)
+		goto failed_mount;
+
+	sbi->s_def_mount_opt = sbi->s_mount_opt;
+
+	err = ext4_check_opt_consistency(fc, sb);
+	if (err < 0)
 		goto failed_mount;
 
-	if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) {
+	ext4_apply_options(fc, sb);
+
+	if (ext4_encoding_init(sb, es))
+		goto failed_mount;
+
+	if (ext4_journal_data_mode_check(sb))
+		goto failed_mount;
+
+	sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
+		(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
+
+	/* i_version is always enabled now */
+	sb->s_flags |= SB_I_VERSION;
+
+	if (ext4_check_feature_compatibility(sb, es, silent))
+		goto failed_mount;
+
+	if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) {
 		ext4_msg(sb, KERN_ERR,
 			 "Number of reserved GDT blocks insanely large: %d",
 			 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks));
@@ -4717,7 +5120,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 	}
 
 	if (sbi->s_daxdev) {
-		if (blocksize == PAGE_SIZE)
+		if (sb->s_blocksize == PAGE_SIZE)
 			set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
 		else
 			ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n");
@@ -4742,40 +5145,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 		goto failed_mount;
 	}
 
-	if (sb->s_blocksize != blocksize) {
-		/*
-		 * bh must be released before kill_bdev(), otherwise
-		 * it won't be freed and its page also. kill_bdev()
-		 * is called by sb_set_blocksize().
-		 */
-		brelse(bh);
-		/* Validate the filesystem blocksize */
-		if (!sb_set_blocksize(sb, blocksize)) {
-			ext4_msg(sb, KERN_ERR, "bad block size %d",
-				 blocksize);
-			bh = NULL;
-			goto failed_mount;
-		}
-
-		logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
-		offset = do_div(logical_sb_block, blocksize);
-		bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
-		if (IS_ERR(bh)) {
-			ext4_msg(sb, KERN_ERR,
-				 "Can't read superblock on 2nd try");
-			ret = PTR_ERR(bh);
-			bh = NULL;
-			goto failed_mount;
-		}
-		es = (struct ext4_super_block *)(bh->b_data + offset);
-		sbi->s_es = es;
-		if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
-			ext4_msg(sb, KERN_ERR,
-				 "Magic mismatch, very weird!");
-			goto failed_mount;
-		}
-	}
-
 	has_huge_files = ext4_has_feature_huge_file(sb);
 	sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
 						      has_huge_files);
@@ -4797,19 +5166,21 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 	sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
 	sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
 
-	sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
-	if (sbi->s_inodes_per_block == 0)
-		goto cantfind_ext4;
+	sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb);
+	if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) {
+		if (!silent)
+			ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
+		goto failed_mount;
+	}
 	if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
-	    sbi->s_inodes_per_group > blocksize * 8) {
+	    sbi->s_inodes_per_group > sb->s_blocksize * 8) {
 		ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n",
 			 sbi->s_inodes_per_group);
 		goto failed_mount;
 	}
 	sbi->s_itb_per_group = sbi->s_inodes_per_group / sbi->s_inodes_per_block;
-	sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
-	sbi->s_sbh = bh;
+	sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb);
 	sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY;
 	sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
 	sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
@@ -4835,54 +5206,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 		}
 	}
 
-	/* Handle clustersize */
-	clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
-	if (ext4_has_feature_bigalloc(sb)) {
-		if (clustersize < blocksize) {
-			ext4_msg(sb, KERN_ERR,
-				 "cluster size (%d) smaller than "
-				 "block size (%d)", clustersize, blocksize);
-			goto failed_mount;
-		}
-		sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
-			le32_to_cpu(es->s_log_block_size);
-		sbi->s_clusters_per_group =
-			le32_to_cpu(es->s_clusters_per_group);
-		if (sbi->s_clusters_per_group > blocksize * 8) {
-			ext4_msg(sb, KERN_ERR,
-				 "#clusters per group too big: %lu",
-				 sbi->s_clusters_per_group);
-			goto failed_mount;
-		}
-		if (sbi->s_blocks_per_group !=
-		    (sbi->s_clusters_per_group * (clustersize / blocksize))) {
-			ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
-				 "clusters per group (%lu) inconsistent",
-				 sbi->s_blocks_per_group,
-				 sbi->s_clusters_per_group);
-			goto failed_mount;
-		}
-	} else {
-		if (clustersize != blocksize) {
-			ext4_msg(sb, KERN_ERR,
-				 "fragment/cluster size (%d) != "
-				 "block size (%d)", clustersize, blocksize);
-			goto failed_mount;
-		}
-		if (sbi->s_blocks_per_group > blocksize * 8) {
-			ext4_msg(sb, KERN_ERR,
-				 "#blocks per group too big: %lu",
-				 sbi->s_blocks_per_group);
-			goto failed_mount;
-		}
-		sbi->s_clusters_per_group = sbi->s_blocks_per_group;
-		sbi->s_cluster_bits = 0;
-	}
-	sbi->s_cluster_ratio = clustersize / blocksize;
-
-	/* Do we have standard group size of clustersize * 8 blocks ? */
-	if (sbi->s_blocks_per_group == clustersize << 3)
-		set_opt2(sb, STD_GROUP_SIZE);
+	if (ext4_handle_clustersize(sb))
+		goto failed_mount;
 
 	/*
 	 * Test whether we have more sectors than will fit in sector_t,
@@ -4896,111 +5221,12 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 		goto failed_mount;
 	}
 
-	if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
-		goto cantfind_ext4;
-
-	/* check blocks count against device size */
-	blocks_count = sb_bdev_nr_blocks(sb);
-	if (blocks_count && ext4_blocks_count(es) > blocks_count) {
-		ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
-			 "exceeds size of device (%llu blocks)",
-			 ext4_blocks_count(es), blocks_count);
+	if (ext4_geometry_check(sb, es))
 		goto failed_mount;
-	}
-
-	/*
-	 * It makes no sense for the first data block to be beyond the end
-	 * of the filesystem.
-	 */
-	if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
-		ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
-			 "block %u is beyond end of filesystem (%llu)",
-			 le32_to_cpu(es->s_first_data_block),
-			 ext4_blocks_count(es));
-		goto failed_mount;
-	}
-	if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) &&
-	    (sbi->s_cluster_ratio == 1)) {
-		ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
-			 "block is 0 with a 1k block and cluster size");
-		goto failed_mount;
-	}
-
-	blocks_count = (ext4_blocks_count(es) -
-			le32_to_cpu(es->s_first_data_block) +
-			EXT4_BLOCKS_PER_GROUP(sb) - 1);
-	do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
-	if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
-		ext4_msg(sb, KERN_WARNING, "groups count too large: %llu "
-			 "(block count %llu, first data block %u, "
-			 "blocks per group %lu)", blocks_count,
-			 ext4_blocks_count(es),
-			 le32_to_cpu(es->s_first_data_block),
-			 EXT4_BLOCKS_PER_GROUP(sb));
-		goto failed_mount;
-	}
-	sbi->s_groups_count = blocks_count;
-	sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
-			(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
-	if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) !=
-	    le32_to_cpu(es->s_inodes_count)) {
-		ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu",
-			 le32_to_cpu(es->s_inodes_count),
-			 ((u64)sbi->s_groups_count * sbi->s_inodes_per_group));
-		ret = -EINVAL;
-		goto failed_mount;
-	}
-	db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
-		   EXT4_DESC_PER_BLOCK(sb);
-	if (ext4_has_feature_meta_bg(sb)) {
-		if (le32_to_cpu(es->s_first_meta_bg) > db_count) {
-			ext4_msg(sb, KERN_WARNING,
-				 "first meta block group too large: %u "
-				 "(group descriptor block count %u)",
-				 le32_to_cpu(es->s_first_meta_bg), db_count);
-			goto failed_mount;
-		}
-	}
-	rcu_assign_pointer(sbi->s_group_desc,
-			   kvmalloc_array(db_count,
-					  sizeof(struct buffer_head *),
-					  GFP_KERNEL));
-	if (sbi->s_group_desc == NULL) {
-		ext4_msg(sb, KERN_ERR, "not enough memory");
-		ret = -ENOMEM;
+	err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed);
+	if (err)
 		goto failed_mount;
-	}
-
-	bgl_lock_init(sbi->s_blockgroup_lock);
-
-	/* Pre-read the descriptors into the buffer cache */
-	for (i = 0; i < db_count; i++) {
-		block = descriptor_loc(sb, logical_sb_block, i);
-		ext4_sb_breadahead_unmovable(sb, block);
-	}
-
-	for (i = 0; i < db_count; i++) {
-		struct buffer_head *bh;
-
-		block = descriptor_loc(sb, logical_sb_block, i);
-		bh = ext4_sb_bread_unmovable(sb, block);
-		if (IS_ERR(bh)) {
-			ext4_msg(sb, KERN_ERR,
-				 "can't read group descriptor %d", i);
-			db_count = i;
-			ret = PTR_ERR(bh);
-			goto failed_mount2;
-		}
-		rcu_read_lock();
-		rcu_dereference(sbi->s_group_desc)[i] = bh;
-		rcu_read_unlock();
-	}
-	sbi->s_gdb_count = db_count;
-	if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
-		ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
-		ret = -EFSCORRUPTED;
-		goto failed_mount2;
-	}
 
 	timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
 	spin_lock_init(&sbi->s_error_lock);
@@ -5038,24 +5264,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
 	mutex_init(&sbi->s_orphan_lock);
 
-	/* Initialize fast commit stuff */
-	atomic_set(&sbi->s_fc_subtid, 0);
-	INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]);
-	INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]);
-	INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]);
-	INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);
-	sbi->s_fc_bytes = 0;
-	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
-	sbi->s_fc_ineligible_tid = 0;
-	spin_lock_init(&sbi->s_fc_lock);
-	memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
-	sbi->s_fc_replay_state.fc_regions = NULL;
-	sbi->s_fc_replay_state.fc_regions_size = 0;
-	sbi->s_fc_replay_state.fc_regions_used = 0;
-	sbi->s_fc_replay_state.fc_regions_valid = 0;
-	sbi->s_fc_replay_state.fc_modified_inodes = NULL;
-	sbi->s_fc_replay_state.fc_modified_inodes_size = 0;
-	sbi->s_fc_replay_state.fc_modified_inodes_used = 0;
+	ext4_fast_commit_init(sb);
 
 	sb->s_root = NULL;
@@ -5072,37 +5281,37 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 	 * root first: it may be modified in the journal!
 	 */
 	if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
-		err = ext4_load_journal(sb, es, ctx->journal_devnum);
+		err = ext4_load_and_init_journal(sb, es, ctx);
 		if (err)
 			goto failed_mount3a;
 	} else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) &&
 		   ext4_has_feature_journal_needs_recovery(sb)) {
 		ext4_msg(sb, KERN_ERR, "required journal recovery "
			 "suppressed and not mounted read-only");
-		goto failed_mount_wq;
+		goto failed_mount3a;
 	} else {
 		/* Nojournal mode, all journal mount options are illegal */
 		if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
 			ext4_msg(sb, KERN_ERR, "can't mount with "
				 "journal_checksum, fs mounted w/o journal");
-			goto failed_mount_wq;
+			goto failed_mount3a;
 		}
 		if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
 			ext4_msg(sb, KERN_ERR, "can't mount with "
				 "journal_async_commit, fs mounted w/o journal");
-			goto failed_mount_wq;
+			goto failed_mount3a;
 		}
 		if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
 			ext4_msg(sb, KERN_ERR, "can't mount with "
				 "commit=%lu, fs mounted w/o journal",
				 sbi->s_commit_interval / HZ);
-			goto failed_mount_wq;
+			goto failed_mount3a;
 		}
 		if (EXT4_MOUNT_DATA_FLAGS &
		    (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
 			ext4_msg(sb, KERN_ERR, "can't mount with "
				 "data=, fs mounted w/o journal");
-			goto failed_mount_wq;
+			goto failed_mount3a;
 		}
 		sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
 		clear_opt(sb, JOURNAL_CHECKSUM);
@@ -5110,76 +5319,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 		clear_opt2(sb, JOURNAL_FAST_COMMIT);
 		sbi->s_journal = NULL;
 		needs_recovery = 0;
-		goto no_journal;
-	}
-
-	if (ext4_has_feature_64bit(sb) &&
-	    !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
-				       JBD2_FEATURE_INCOMPAT_64BIT)) {
-		ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
-		goto failed_mount_wq;
-	}
-
-	if (!set_journal_csum_feature_set(sb)) {
-		ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
-			 "feature set");
-		goto failed_mount_wq;
-	}
-
-	if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
-	    !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
-				       JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) {
-		ext4_msg(sb, KERN_ERR,
-			 "Failed to set fast commit journal feature");
-		goto failed_mount_wq;
-	}
-
-	/* We have now updated the journal if required, so we can
-	 * validate the data journaling mode. */
-	switch (test_opt(sb, DATA_FLAGS)) {
-	case 0:
-		/* No mode set, assume a default based on the journal
-		 * capabilities: ORDERED_DATA if the journal can
-		 * cope, else JOURNAL_DATA
-		 */
-		if (jbd2_journal_check_available_features
-		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
-			set_opt(sb, ORDERED_DATA);
-			sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
-		} else {
-			set_opt(sb, JOURNAL_DATA);
-			sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
-		}
-		break;
-
-	case EXT4_MOUNT_ORDERED_DATA:
-	case EXT4_MOUNT_WRITEBACK_DATA:
-		if (!jbd2_journal_check_available_features
-		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
-			ext4_msg(sb, KERN_ERR, "Journal does not support "
-				 "requested data journaling mode");
-			goto failed_mount_wq;
-		}
-		break;
-	default:
-		break;
-	}
-
-	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
-	    test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
-		ext4_msg(sb, KERN_ERR, "can't mount with "
-			 "journal_async_commit in data=ordered mode");
-		goto failed_mount_wq;
 	}
 
-	set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio);
-
-	sbi->s_journal->j_submit_inode_data_buffers =
-		ext4_journal_submit_inode_data_buffers;
-	sbi->s_journal->j_finish_inode_data_buffers =
-		ext4_journal_finish_inode_data_buffers;
-
-no_journal:
 	if (!test_opt(sb, NO_MBCACHE)) {
 		sbi->s_ea_block_cache = ext4_xattr_create_cache();
 		if (!sbi->s_ea_block_cache) {
@@ -5198,7 +5339,7 @@ no_journal:
 		}
 	}
 
-	if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) {
+	if (ext4_has_feature_verity(sb) && sb->s_blocksize != PAGE_SIZE) {
 		ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity");
 		goto failed_mount_wq;
 	}
@@ -5408,11 +5549,6 @@ no_journal:
 
 	return 0;
 
-cantfind_ext4:
-	if (!silent)
-		ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
-	goto failed_mount;
-
 failed_mount9:
 	ext4_release_orphan_info(sb);
 failed_mount8:
@@ -5466,13 +5602,7 @@ failed_mount3:
 	flush_work(&sbi->s_error_work);
 	del_timer_sync(&sbi->s_err_report);
 	ext4_stop_mmpd(sbi);
-failed_mount2:
-	rcu_read_lock();
-	group_desc = rcu_dereference(sbi->s_group_desc);
-	for (i = 0; i < db_count; i++)
-		brelse(group_desc[i]);
-	kvfree(group_desc);
-	rcu_read_unlock();
+	ext4_group_desc_free(sbi);
 failed_mount:
 	if (sbi->s_chksum_driver)
 		crypto_free_shash(sbi->s_chksum_driver);
@@ -5487,7 +5617,7 @@ failed_mount:
 #endif
 	fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
 	/* ext4_blkdev_remove() calls kill_bdev(), release bh before it. */
-	brelse(bh);
+	brelse(sbi->s_sbh);
 	ext4_blkdev_remove(sbi);
 out_fail:
 	sb->s_fs_info = NULL;
@@ -6653,7 +6783,7 @@ static int ext4_write_info(struct super_block *sb, int type)
 	handle_t *handle;
 
 	/* Data block + inode block */
-	handle = ext4_journal_start(d_inode(sb->s_root), EXT4_HT_QUOTA, 2);
+	handle = ext4_journal_start_sb(sb, EXT4_HT_QUOTA, 2);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 	ret = dquot_commit_info(sb, type);
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index b051d19b5c8a..3c640bd7ecae 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -298,16 +298,14 @@ static int ext4_get_verity_descriptor_location(struct inode *inode,
 	last_extent = path[path->p_depth].p_ext;
 	if (!last_extent) {
 		EXT4_ERROR_INODE(inode, "verity file has no extents");
-		ext4_ext_drop_refs(path);
-		kfree(path);
+		ext4_free_ext_path(path);
 		return -EFSCORRUPTED;
 	}
 
 	end_lblk = le32_to_cpu(last_extent->ee_block) +
 		   ext4_ext_get_actual_len(last_extent);
 	desc_size_pos = (u64)end_lblk << inode->i_blkbits;
-	ext4_ext_drop_refs(path);
-	kfree(path);
+	ext4_free_ext_path(path);
 
 	if (desc_size_pos < sizeof(desc_size_disk))
 		goto bad;
@@ -365,13 +363,14 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode,
 					       pgoff_t index,
 					       unsigned long num_ra_pages)
 {
-	DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
 	struct page *page;
 
 	index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
 
 	page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
 	if (!page || !PageUptodate(page)) {
+		DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
+
 		if (page)
 			put_page(page);
 		else if (num_ra_pages > 1)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 533216e80fa2..36d6ba7190b6 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -2412,6 +2412,7 @@ retry_inode:
 		if (!error) {
 			ext4_xattr_update_super_block(handle, inode->i_sb);
 			inode->i_ctime = current_time(inode);
+			inode_inc_iversion(inode);
 			if (!value)
 				no_expand = 0;
 			error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);