summaryrefslogtreecommitdiff
path: root/fs/ext4
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-07-19 03:03:42 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2024-07-19 03:03:42 +0300
commit51ed42a8a135511f6d6f75b56e85e6586a06a93c (patch)
tree7cc371694ac7d6b8649a000c482a51cc3fee28e4 /fs/ext4
parentdddebdece62ead1ac1112e6df375f56a1cb45f84 (diff)
parentf9ca51596bbfd0f9c386dd1c613c394c78d9e5e6 (diff)
downloadlinux-51ed42a8a135511f6d6f75b56e85e6586a06a93c.tar.xz
Merge tag 'ext4_for_linus-6.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Ted Ts'o: "Many cleanups and bug fixes in ext4, especially for the fast commit feature. Also some performance improvements; in particular, improving IOPS and throughput on fast devices running Async Direct I/O by up to 20% by optimizing jbd2_transaction_committed()" * tag 'ext4_for_linus-6.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (40 commits) ext4: make sure the first directory block is not a hole ext4: check dot and dotdot of dx_root before making dir indexed ext4: sanity check for NULL pointer after ext4_force_shutdown jbd2: increase maximum transaction size jbd2: drop pointless shrinker batch initialization jbd2: avoid infinite transaction commit loop jbd2: precompute number of transaction descriptor blocks jbd2: make jbd2_journal_get_max_txn_bufs() internal jbd2: avoid mount failed when commit block is partial submitted ext4: avoid writing unitialized memory to disk in EA inodes ext4: don't track ranges in fast_commit if inode has inlined data ext4: fix possible tid_t sequence overflows ext4: use ext4_update_inode_fsync_trans() helper in inode creation ext4: add missing MODULE_DESCRIPTION() jbd2: add missing MODULE_DESCRIPTION() ext4: use memtostr_pad() for s_volume_name jbd2: speed up jbd2_transaction_committed() ext4: make ext4_da_map_blocks() buffer_head unaware ext4: make ext4_insert_delayed_block() insert multi-blocks ext4: factor out a helper to check the cluster allocation state ...
Diffstat (limited to 'fs/ext4')
-rw-r--r--fs/ext4/block_validity.c2
-rw-r--r--fs/ext4/ext4.h2
-rw-r--r--fs/ext4/extents_status.c72
-rw-r--r--fs/ext4/extents_status.h5
-rw-r--r--fs/ext4/fast_commit.c14
-rw-r--r--fs/ext4/ialloc.c5
-rw-r--r--fs/ext4/inline.c6
-rw-r--r--fs/ext4/inode-test.c1
-rw-r--r--fs/ext4/inode.c255
-rw-r--r--fs/ext4/ioctl.c2
-rw-r--r--fs/ext4/namei.c73
-rw-r--r--fs/ext4/super.c6
-rw-r--r--fs/ext4/xattr.c6
13 files changed, 309 insertions, 140 deletions
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 6fe3c941b565..87ee3a17bd29 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -72,7 +72,7 @@ static int add_system_zone(struct ext4_system_blocks *system_blks,
{
struct ext4_system_zone *new_entry, *entry;
struct rb_node **n = &system_blks->root.rb_node, *node;
- struct rb_node *parent = NULL, *new_node = NULL;
+ struct rb_node *parent = NULL, *new_node;
while (*n) {
parent = *n;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8007abd4972d..08acd152261e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1347,7 +1347,7 @@ struct ext4_super_block {
/*60*/ __le32 s_feature_incompat; /* incompatible feature set */
__le32 s_feature_ro_compat; /* readonly-compatible feature set */
/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */
-/*78*/ char s_volume_name[EXT4_LABEL_MAX]; /* volume name */
+/*78*/ char s_volume_name[EXT4_LABEL_MAX] __nonstring; /* volume name */
/*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */
/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */
/*
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 4a00e2f019d9..17dcf13adde2 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -310,6 +310,8 @@ void ext4_es_find_extent_range(struct inode *inode,
ext4_lblk_t lblk, ext4_lblk_t end,
struct extent_status *es)
{
+ es->es_lblk = es->es_len = es->es_pblk = 0;
+
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
@@ -2052,34 +2054,49 @@ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
}
/*
- * ext4_es_insert_delayed_block - adds a delayed block to the extents status
- * tree, adding a pending reservation where
- * needed
+ * ext4_es_insert_delayed_extent - adds some delayed blocks to the extents
+ * status tree, adding a pending reservation
+ * where needed
*
* @inode - file containing the newly added block
- * @lblk - logical block to be added
- * @allocated - indicates whether a physical cluster has been allocated for
- * the logical cluster that contains the block
+ * @lblk - start logical block to be added
+ * @len - length of blocks to be added
+ * @lclu_allocated/end_allocated - indicates whether a physical cluster has
+ * been allocated for the logical cluster
+ * that contains the start/end block. Note that
+ * end_allocated should always be set to false
+ * if the start and the end block are in the
+ * same cluster
*/
-void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
- bool allocated)
+void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, bool lclu_allocated,
+ bool end_allocated)
{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct extent_status newes;
+ ext4_lblk_t end = lblk + len - 1;
int err1 = 0, err2 = 0, err3 = 0;
struct extent_status *es1 = NULL;
struct extent_status *es2 = NULL;
- struct pending_reservation *pr = NULL;
+ struct pending_reservation *pr1 = NULL;
+ struct pending_reservation *pr2 = NULL;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
- es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
- lblk, inode->i_ino);
+ es_debug("add [%u/%u) delayed to extent status tree of inode %lu\n",
+ lblk, len, inode->i_ino);
+ if (!len)
+ return;
+
+ WARN_ON_ONCE((EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) &&
+ end_allocated);
newes.es_lblk = lblk;
- newes.es_len = 1;
+ newes.es_len = len;
ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
- trace_ext4_es_insert_delayed_block(inode, &newes, allocated);
+ trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated,
+ end_allocated);
ext4_es_insert_extent_check(inode, &newes);
@@ -2088,11 +2105,15 @@ retry:
es1 = __es_alloc_extent(true);
if ((err1 || err2) && !es2)
es2 = __es_alloc_extent(true);
- if ((err1 || err2 || err3) && allocated && !pr)
- pr = __alloc_pending(true);
+ if (err1 || err2 || err3) {
+ if (lclu_allocated && !pr1)
+ pr1 = __alloc_pending(true);
+ if (end_allocated && !pr2)
+ pr2 = __alloc_pending(true);
+ }
write_lock(&EXT4_I(inode)->i_es_lock);
- err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1);
+ err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
if (err1 != 0)
goto error;
/* Free preallocated extent if it didn't get used. */
@@ -2112,13 +2133,22 @@ retry:
es2 = NULL;
}
- if (allocated) {
- err3 = __insert_pending(inode, lblk, &pr);
+ if (lclu_allocated) {
+ err3 = __insert_pending(inode, lblk, &pr1);
if (err3 != 0)
goto error;
- if (pr) {
- __free_pending(pr);
- pr = NULL;
+ if (pr1) {
+ __free_pending(pr1);
+ pr1 = NULL;
+ }
+ }
+ if (end_allocated) {
+ err3 = __insert_pending(inode, end, &pr2);
+ if (err3 != 0)
+ goto error;
+ if (pr2) {
+ __free_pending(pr2);
+ pr2 = NULL;
}
}
error:
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index d9847a4a25db..3c8e2edee5d5 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -249,8 +249,9 @@ extern void ext4_exit_pending(void);
extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
-extern void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
- bool allocated);
+extern void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, bool lclu_allocated,
+ bool end_allocated);
extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
extern void ext4_clear_inode_es(struct inode *inode);
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 87c009e0c59a..3926a05eceee 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -353,7 +353,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl
read_unlock(&sbi->s_journal->j_state_lock);
}
spin_lock(&sbi->s_fc_lock);
- if (sbi->s_fc_ineligible_tid < tid)
+ if (tid_gt(tid, sbi->s_fc_ineligible_tid))
sbi->s_fc_ineligible_tid = tid;
spin_unlock(&sbi->s_fc_lock);
WARN_ON(reason >= EXT4_FC_REASON_MAX);
@@ -649,6 +649,12 @@ void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t star
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
return;
+ if (ext4_has_inline_data(inode)) {
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR,
+ handle);
+ return;
+ }
+
args.start = start;
args.end = end;
@@ -1207,7 +1213,7 @@ restart_fc:
if (ret == -EALREADY) {
/* There was an ongoing commit, check if we need to restart */
if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
- commit_tid > journal->j_commit_sequence)
+ tid_gt(commit_tid, journal->j_commit_sequence))
goto restart_fc;
ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
commit_tid);
@@ -1282,7 +1288,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
list_del_init(&iter->i_fc_list);
ext4_clear_inode_state(&iter->vfs_inode,
EXT4_STATE_FC_COMMITTING);
- if (iter->i_sync_tid <= tid)
+ if (tid_geq(tid, iter->i_sync_tid))
ext4_fc_reset_inode(&iter->vfs_inode);
/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
smp_mb();
@@ -1313,7 +1319,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
&sbi->s_fc_q[FC_Q_MAIN]);
- if (tid >= sbi->s_fc_ineligible_tid) {
+ if (tid_geq(tid, sbi->s_fc_ineligible_tid)) {
sbi->s_fc_ineligible_tid = 0;
ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index e9bbb1da2d0a..9dfd768ed9f8 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1336,10 +1336,7 @@ got:
}
}
- if (ext4_handle_valid(handle)) {
- ei->i_sync_tid = handle->h_transaction->t_tid;
- ei->i_datasync_tid = handle->h_transaction->t_tid;
- }
+ ext4_update_inode_fsync_trans(handle, inode, 1);
err = ext4_mark_inode_dirty(handle, inode);
if (err) {
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index d5bd1e3a5d36..e7a09a99837b 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1410,7 +1410,11 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
hinfo->hash = EXT4_DIRENT_HASH(de);
hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de);
} else {
- ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
+ err = ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
+ if (err) {
+ ret = err;
+ goto out;
+ }
}
if ((hinfo->hash < start_hash) ||
((hinfo->hash == start_hash) &&
diff --git a/fs/ext4/inode-test.c b/fs/ext4/inode-test.c
index f0c0fd507fbc..749af7ad4e09 100644
--- a/fs/ext4/inode-test.c
+++ b/fs/ext4/inode-test.c
@@ -279,4 +279,5 @@ static struct kunit_suite ext4_inode_test_suite = {
kunit_test_suites(&ext4_inode_test_suite);
+MODULE_DESCRIPTION("KUnit test of ext4 inode timestamp decoding");
MODULE_LICENSE("GPL v2");
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4bae9ccf5fe0..941c1c0d5c6e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -453,6 +453,35 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
}
#endif /* ES_AGGRESSIVE_TEST */
+static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map)
+{
+ unsigned int status;
+ int retval;
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ retval = ext4_ext_map_blocks(handle, inode, map, 0);
+ else
+ retval = ext4_ind_map_blocks(handle, inode, map, 0);
+
+ if (retval <= 0)
+ return retval;
+
+ if (unlikely(retval != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map->m_len);
+ WARN_ON(1);
+ }
+
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
+ return retval;
+}
+
/*
* The ext4_map_blocks() function tries to look up the requested blocks,
* and returns if the blocks are already mapped.
@@ -1450,9 +1479,9 @@ static int ext4_journalled_write_end(struct file *file,
}
/*
- * Reserve space for a single cluster
+ * Reserve space for 'nr_resv' clusters
*/
-static int ext4_da_reserve_space(struct inode *inode)
+static int ext4_da_reserve_space(struct inode *inode, int nr_resv)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -1463,18 +1492,18 @@ static int ext4_da_reserve_space(struct inode *inode)
* us from metadata over-estimation, though we may go over by
* a small amount in the end. Here we just reserve for data.
*/
- ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
+ ret = dquot_reserve_block(inode, EXT4_C2B(sbi, nr_resv));
if (ret)
return ret;
spin_lock(&ei->i_block_reservation_lock);
- if (ext4_claim_free_clusters(sbi, 1, 0)) {
+ if (ext4_claim_free_clusters(sbi, nr_resv, 0)) {
spin_unlock(&ei->i_block_reservation_lock);
- dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
+ dquot_release_reservation_block(inode, EXT4_C2B(sbi, nr_resv));
return -ENOSPC;
}
- ei->i_reserved_data_blocks++;
- trace_ext4_da_reserve_space(inode);
+ ei->i_reserved_data_blocks += nr_resv;
+ trace_ext4_da_reserve_space(inode, nr_resv);
spin_unlock(&ei->i_block_reservation_lock);
return 0; /* success */
@@ -1621,24 +1650,58 @@ static void ext4_print_free_blocks(struct inode *inode)
}
/*
- * ext4_insert_delayed_block - adds a delayed block to the extents status
- * tree, incrementing the reserved cluster/block
- * count or making a pending reservation
- * where needed
+ * Check whether the cluster containing lblk has been allocated or has
+ * delalloc reservation.
+ *
+ * Returns 0 if the cluster doesn't have either, 1 if it has delalloc
+ * reservation, 2 if it's already been allocated, negative error code on
+ * failure.
+ */
+static int ext4_clu_alloc_state(struct inode *inode, ext4_lblk_t lblk)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int ret;
+
+ /* Has delalloc reservation? */
+ if (ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk))
+ return 1;
+
+ /* Already been allocated? */
+ if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk))
+ return 2;
+ ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk));
+ if (ret < 0)
+ return ret;
+ if (ret > 0)
+ return 2;
+
+ return 0;
+}
+
+/*
+ * ext4_insert_delayed_blocks - adds a multiple delayed blocks to the extents
+ * status tree, incrementing the reserved
+ * cluster/block count or making pending
+ * reservations where needed
*
* @inode - file containing the newly added block
- * @lblk - logical block to be added
+ * @lblk - start logical block to be added
+ * @len - length of blocks to be added
*
* Returns 0 on success, negative error code on failure.
*/
-static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
+static int ext4_insert_delayed_blocks(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int ret;
- bool allocated = false;
+ bool lclu_allocated = false;
+ bool end_allocated = false;
+ ext4_lblk_t resv_clu;
+ ext4_lblk_t end = lblk + len - 1;
/*
- * If the cluster containing lblk is shared with a delayed,
+ * If the cluster containing lblk or end is shared with a delayed,
* written, or unwritten extent in a bigalloc file system, it's
* already been accounted for and does not need to be reserved.
* A pending reservation must be made for the cluster if it's
@@ -1649,81 +1712,84 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
* extents status tree doesn't get a match.
*/
if (sbi->s_cluster_ratio == 1) {
- ret = ext4_da_reserve_space(inode);
+ ret = ext4_da_reserve_space(inode, len);
if (ret != 0) /* ENOSPC */
return ret;
} else { /* bigalloc */
- if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
- if (!ext4_es_scan_clu(inode,
- &ext4_es_is_mapped, lblk)) {
- ret = ext4_clu_mapped(inode,
- EXT4_B2C(sbi, lblk));
- if (ret < 0)
- return ret;
- if (ret == 0) {
- ret = ext4_da_reserve_space(inode);
- if (ret != 0) /* ENOSPC */
- return ret;
- } else {
- allocated = true;
- }
- } else {
- allocated = true;
+ resv_clu = EXT4_B2C(sbi, end) - EXT4_B2C(sbi, lblk) + 1;
+
+ ret = ext4_clu_alloc_state(inode, lblk);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ resv_clu--;
+ lclu_allocated = (ret == 2);
+ }
+
+ if (EXT4_B2C(sbi, lblk) != EXT4_B2C(sbi, end)) {
+ ret = ext4_clu_alloc_state(inode, end);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ resv_clu--;
+ end_allocated = (ret == 2);
}
}
+
+ if (resv_clu) {
+ ret = ext4_da_reserve_space(inode, resv_clu);
+ if (ret != 0) /* ENOSPC */
+ return ret;
+ }
}
- ext4_es_insert_delayed_block(inode, lblk, allocated);
+ ext4_es_insert_delayed_extent(inode, lblk, len, lclu_allocated,
+ end_allocated);
return 0;
}
/*
- * This function is grabs code from the very beginning of
- * ext4_map_blocks, but assumes that the caller is from delayed write
- * time. This function looks up the requested blocks and sets the
- * buffer delay bit under the protection of i_data_sem.
+ * Looks up the requested blocks and sets the delalloc extent map.
+ * First try to look up for the extent entry that contains the requested
+ * blocks in the extent status tree without i_data_sem, then try to look
+ * up for the ondisk extent mapping with i_data_sem in read mode,
+ * finally hold i_data_sem in write mode, looks up again and add a
+ * delalloc extent entry if it still couldn't find any extent. Pass out
+ * the mapped extent through @map and return 0 on success.
*/
-static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
- struct ext4_map_blocks *map,
- struct buffer_head *bh)
+static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
{
struct extent_status es;
int retval;
- sector_t invalid_block = ~((sector_t) 0xffff);
#ifdef ES_AGGRESSIVE_TEST
struct ext4_map_blocks orig_map;
memcpy(&orig_map, map, sizeof(*map));
#endif
- if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
- invalid_block = ~0;
-
map->m_flags = 0;
ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len,
(unsigned long) map->m_lblk);
/* Lookup extent status tree firstly */
- if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
+ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+ map->m_len = min_t(unsigned int, map->m_len,
+ es.es_len - (map->m_lblk - es.es_lblk));
+
if (ext4_es_is_hole(&es))
goto add_delayed;
+found:
/*
* Delayed extent could be allocated by fallocate.
* So we need to check it.
*/
- if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
- map_bh(bh, inode->i_sb, invalid_block);
- set_buffer_new(bh);
- set_buffer_delay(bh);
+ if (ext4_es_is_delonly(&es)) {
+ map->m_flags |= EXT4_MAP_DELAYED;
return 0;
}
- map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
- retval = es.es_len - (iblock - es.es_lblk);
- if (retval > map->m_len)
- retval = map->m_len;
- map->m_len = retval;
+ map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk;
if (ext4_es_is_written(&es))
map->m_flags |= EXT4_MAP_MAPPED;
else if (ext4_es_is_unwritten(&es))
@@ -1734,7 +1800,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
#ifdef ES_AGGRESSIVE_TEST
ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
#endif
- return retval;
+ return 0;
}
/*
@@ -1744,44 +1810,41 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
down_read(&EXT4_I(inode)->i_data_sem);
if (ext4_has_inline_data(inode))
retval = 0;
- else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- retval = ext4_ext_map_blocks(NULL, inode, map, 0);
else
- retval = ext4_ind_map_blocks(NULL, inode, map, 0);
- if (retval < 0) {
- up_read(&EXT4_I(inode)->i_data_sem);
- return retval;
- }
- if (retval > 0) {
- unsigned int status;
-
- if (unlikely(retval != map->m_len)) {
- ext4_warning(inode->i_sb,
- "ES len assertion failed for inode "
- "%lu: retval %d != map->m_len %d",
- inode->i_ino, retval, map->m_len);
- WARN_ON(1);
- }
-
- status = map->m_flags & EXT4_MAP_UNWRITTEN ?
- EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
- ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status);
- up_read(&EXT4_I(inode)->i_data_sem);
- return retval;
- }
+ retval = ext4_map_query_blocks(NULL, inode, map);
up_read(&EXT4_I(inode)->i_data_sem);
+ if (retval)
+ return retval < 0 ? retval : 0;
add_delayed:
down_write(&EXT4_I(inode)->i_data_sem);
- retval = ext4_insert_delayed_block(inode, map->m_lblk);
+ /*
+ * Page fault path (ext4_page_mkwrite does not take i_rwsem)
+ * and fallocate path (no folio lock) can race. Make sure we
+ * lookup the extent status tree here again while i_data_sem
+ * is held in write mode, before inserting a new da entry in
+ * the extent status tree.
+ */
+ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+ map->m_len = min_t(unsigned int, map->m_len,
+ es.es_len - (map->m_lblk - es.es_lblk));
+
+ if (!ext4_es_is_hole(&es)) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto found;
+ }
+ } else if (!ext4_has_inline_data(inode)) {
+ retval = ext4_map_query_blocks(NULL, inode, map);
+ if (retval) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ return retval < 0 ? retval : 0;
+ }
+ }
+
+ map->m_flags |= EXT4_MAP_DELAYED;
+ retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len);
up_write(&EXT4_I(inode)->i_data_sem);
- if (retval)
- return retval;
- map_bh(bh, inode->i_sb, invalid_block);
- set_buffer_new(bh);
- set_buffer_delay(bh);
return retval;
}
@@ -1801,11 +1864,15 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create)
{
struct ext4_map_blocks map;
+ sector_t invalid_block = ~((sector_t) 0xffff);
int ret = 0;
BUG_ON(create == 0);
BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
+ if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
+ invalid_block = ~0;
+
map.m_lblk = iblock;
map.m_len = 1;
@@ -1814,10 +1881,17 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
* preallocated blocks are unmapped but should treated
* the same as allocated blocks.
*/
- ret = ext4_da_map_blocks(inode, iblock, &map, bh);
- if (ret <= 0)
+ ret = ext4_da_map_blocks(inode, &map);
+ if (ret < 0)
return ret;
+ if (map.m_flags & EXT4_MAP_DELAYED) {
+ map_bh(bh, inode->i_sb, invalid_block);
+ set_buffer_new(bh);
+ set_buffer_delay(bh);
+ return 0;
+ }
+
map_bh(bh, inode->i_sb, map.m_pblk);
ext4_update_bh_state(bh, map.m_flags);
@@ -2945,6 +3019,11 @@ static int ext4_da_do_write_end(struct address_space *mapping,
bool disksize_changed = false;
loff_t new_i_size;
+ if (unlikely(!folio_buffers(folio))) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return -EIO;
+ }
/*
* block_write_end() will mark the inode as dirty with I_DIRTY_PAGES
* flag, which all that's needed to trigger page writeback.
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index dab7acd49709..e8bf5972dd47 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -1151,7 +1151,7 @@ static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label
BUILD_BUG_ON(EXT4_LABEL_MAX >= FSLABEL_MAX);
lock_buffer(sbi->s_sbh);
- strscpy_pad(label, sbi->s_es->s_volume_name);
+ memtostr_pad(label, sbi->s_es->s_volume_name);
unlock_buffer(sbi->s_sbh);
if (copy_to_user(user_label, label, sizeof(label)))
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index e6769b97a970..6a95713f9193 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -151,10 +151,11 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
return bh;
}
- if (!bh && (type == INDEX || type == DIRENT_HTREE)) {
+ /* The first directory block must not be a hole. */
+ if (!bh && (type == INDEX || type == DIRENT_HTREE || block == 0)) {
ext4_error_inode(inode, func, line, block,
- "Directory hole found for htree %s block",
- (type == INDEX) ? "index" : "leaf");
+ "Directory hole found for htree %s block %u",
+ (type == INDEX) ? "index" : "leaf", block);
return ERR_PTR(-EFSCORRUPTED);
}
if (!bh)
@@ -2172,6 +2173,52 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
return err ? err : err2;
}
+static bool ext4_check_dx_root(struct inode *dir, struct dx_root *root)
+{
+ struct fake_dirent *fde;
+ const char *error_msg;
+ unsigned int rlen;
+ unsigned int blocksize = dir->i_sb->s_blocksize;
+ char *blockend = (char *)root + dir->i_sb->s_blocksize;
+
+ fde = &root->dot;
+ if (unlikely(fde->name_len != 1)) {
+ error_msg = "invalid name_len for '.'";
+ goto corrupted;
+ }
+ if (unlikely(strncmp(root->dot_name, ".", fde->name_len))) {
+ error_msg = "invalid name for '.'";
+ goto corrupted;
+ }
+ rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize);
+ if (unlikely((char *)fde + rlen >= blockend)) {
+ error_msg = "invalid rec_len for '.'";
+ goto corrupted;
+ }
+
+ fde = &root->dotdot;
+ if (unlikely(fde->name_len != 2)) {
+ error_msg = "invalid name_len for '..'";
+ goto corrupted;
+ }
+ if (unlikely(strncmp(root->dotdot_name, "..", fde->name_len))) {
+ error_msg = "invalid name for '..'";
+ goto corrupted;
+ }
+ rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize);
+ if (unlikely((char *)fde + rlen >= blockend)) {
+ error_msg = "invalid rec_len for '..'";
+ goto corrupted;
+ }
+
+ return true;
+
+corrupted:
+ EXT4_ERROR_INODE(dir, "Corrupt dir, %s, running e2fsck is recommended",
+ error_msg);
+ return false;
+}
+
/*
* This converts a one block unindexed directory to a 3 block indexed
* directory, and adds the dentry to the indexed directory.
@@ -2206,17 +2253,17 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
brelse(bh);
return retval;
}
+
root = (struct dx_root *) bh->b_data;
+ if (!ext4_check_dx_root(dir, root)) {
+ brelse(bh);
+ return -EFSCORRUPTED;
+ }
/* The 0th block becomes the root, move the dirents out */
fde = &root->dotdot;
de = (struct ext4_dir_entry_2 *)((char *)fde +
ext4_rec_len_from_disk(fde->rec_len, blocksize));
- if ((char *) de >= (((char *) root) + blocksize)) {
- EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
- brelse(bh);
- return -EFSCORRUPTED;
- }
len = ((char *) root) + (blocksize - csum_size) - (char *) de;
/* Allocate new block for the 0th block's dirents */
@@ -3038,10 +3085,7 @@ bool ext4_empty_dir(struct inode *inode)
EXT4_ERROR_INODE(inode, "invalid size");
return false;
}
- /* The first directory block must not be a hole,
- * so treat it as DIRENT_HTREE
- */
- bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
+ bh = ext4_read_dirblock(inode, 0, EITHER);
if (IS_ERR(bh))
return false;
@@ -3483,10 +3527,7 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
struct ext4_dir_entry_2 *de;
unsigned int offset;
- /* The first directory block must not be a hole, so
- * treat it as DIRENT_HTREE
- */
- bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
+ bh = ext4_read_dirblock(inode, 0, EITHER);
if (IS_ERR(bh)) {
*retval = PTR_ERR(bh);
return NULL;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index eb899628e121..e72145c4ae5a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1327,6 +1327,9 @@ static void ext4_put_super(struct super_block *sb)
ext4_group_desc_free(sbi);
ext4_flex_groups_free(sbi);
+
+ WARN_ON_ONCE(!(sbi->s_mount_state & EXT4_ERROR_FS) &&
+ percpu_counter_sum(&sbi->s_dirtyclusters_counter));
ext4_percpu_param_destroy(sbi);
#ifdef CONFIG_QUOTA
for (int i = 0; i < EXT4_MAXQUOTAS; i++)
@@ -1457,7 +1460,8 @@ static void ext4_destroy_inode(struct inode *inode)
dump_stack();
}
- if (EXT4_I(inode)->i_reserved_data_blocks)
+ if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ERROR_FS) &&
+ WARN_ON_ONCE(EXT4_I(inode)->i_reserved_data_blocks))
ext4_msg(inode->i_sb, KERN_ERR,
"Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!",
inode->i_ino, EXT4_I(inode),
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 6460879b9fcb..46ce2f21fef9 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1433,6 +1433,12 @@ retry:
goto out;
memcpy(bh->b_data, buf, csize);
+ /*
+ * Zero out block tail to avoid writing uninitialized memory
+ * to disk.
+ */
+ if (csize < blocksize)
+ memset(bh->b_data + csize, 0, blocksize - csize);
set_buffer_uptodate(bh);
ext4_handle_dirty_metadata(handle, ea_inode, bh);