Diffstat (limited to 'fs/ext4')
-rw-r--r--  fs/ext4/balloc.c          |    6
-rw-r--r--  fs/ext4/bitmap.c          |   16
-rw-r--r--  fs/ext4/block_validity.c  |    5
-rw-r--r--  fs/ext4/dir.c             |    7
-rw-r--r--  fs/ext4/ext4.h            |  259
-rw-r--r--  fs/ext4/ext4_extents.h    |    7
-rw-r--r--  fs/ext4/ext4_jbd2.c       |   15
-rw-r--r--  fs/ext4/ext4_jbd2.h       |  117
-rw-r--r--  fs/ext4/extents.c         |  756
-rw-r--r--  fs/ext4/extents_status.c  |   36
-rw-r--r--  fs/ext4/fast_commit.c     |  460
-rw-r--r--  fs/ext4/file.c            |   62
-rw-r--r--  fs/ext4/fsmap.c           |   23
-rw-r--r--  fs/ext4/fsync.c           |   12
-rw-r--r--  fs/ext4/hash.c            |    2
-rw-r--r--  fs/ext4/ialloc.c          |   19
-rw-r--r--  fs/ext4/indirect.c        |    4
-rw-r--r--  fs/ext4/inline.c          |  299
-rw-r--r--  fs/ext4/inode.c           | 1242
-rw-r--r--  fs/ext4/ioctl.c           |   33
-rw-r--r--  fs/ext4/mballoc-test.c    |    7
-rw-r--r--  fs/ext4/mballoc.c         |  924
-rw-r--r--  fs/ext4/mballoc.h         |    9
-rw-r--r--  fs/ext4/mmp.c             |    8
-rw-r--r--  fs/ext4/move_extent.c     |   16
-rw-r--r--  fs/ext4/namei.c           |  210
-rw-r--r--  fs/ext4/orphan.c          |   20
-rw-r--r--  fs/ext4/page-io.c         |   83
-rw-r--r--  fs/ext4/readpage.c        |   28
-rw-r--r--  fs/ext4/resize.c          |    6
-rw-r--r--  fs/ext4/super.c           |  367
-rw-r--r--  fs/ext4/sysfs.c           |    4
-rw-r--r--  fs/ext4/xattr.c           |   61
-rw-r--r--  fs/ext4/xattr.h           |   10
34 files changed, 2914 insertions(+), 2219 deletions(-)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 8042ad873808..c9329ed5c094 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -649,8 +649,8 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
/* Hm, nope. Are (enough) root reserved clusters available? */
if (uid_eq(sbi->s_resuid, current_fsuid()) ||
(!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
- capable(CAP_SYS_RESOURCE) ||
- (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+ (flags & EXT4_MB_USE_ROOT_BLOCKS) ||
+ capable(CAP_SYS_RESOURCE)) {
if (free_clusters >= (nclusters + dirty_clusters +
resv_clusters))
@@ -703,7 +703,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
* possible we just missed a transaction commit that did so
*/
smp_mb();
- if (sbi->s_mb_free_pending == 0) {
+ if (atomic_read(&sbi->s_mb_free_pending) == 0) {
if (test_opt(sb, DISCARD)) {
atomic_inc(&sbi->s_retry_alloc_pending);
flush_work(&sbi->s_discard_work);
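
The balloc.c changes pair with the mballoc.c rework later in this series: the capable(CAP_SYS_RESOURCE) test moves to the end of the short-circuit chain so the cheaper checks run first, and s_mb_free_pending becomes an atomic_t so it can be read locklessly. A minimal runnable userspace model of that lockless pending-free counter, using C11 atomics as a stand-in for the kernel's atomic_t:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int mb_free_pending;	/* blocks freed but not yet committed */

static void record_freed_blocks(int nr)
{
	atomic_fetch_add(&mb_free_pending, nr);
}

static bool should_retry_alloc(void)
{
	/* full fence, modeling the smp_mb() that precedes the read above */
	atomic_thread_fence(memory_order_seq_cst);
	return atomic_load(&mb_free_pending) != 0;
}

int main(void)
{
	record_freed_blocks(8);
	printf("retry allocation: %s\n", should_retry_alloc() ? "yes" : "no");
	return 0;
}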
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index 2a135075468d..87760fabdd2e 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -25,12 +25,12 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
int sz;
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return 1;
sz = EXT4_INODES_PER_GROUP(sb) >> 3;
provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
- calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) {
hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi);
provided |= (hi << 16);
@@ -48,11 +48,11 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
int sz;
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return;
sz = EXT4_INODES_PER_GROUP(sb) >> 3;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END)
gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16);
@@ -67,11 +67,11 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8;
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return 1;
provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo);
- calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) {
hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi);
provided |= (hi << 16);
@@ -89,10 +89,10 @@ void ext4_block_bitmap_csum_set(struct super_block *sb,
__u32 csum;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END)
gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16);
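
With the sbi argument gone, ext4_chksum() is now a thin wrapper around crc32c(crc, buf, len). A userspace sketch of how the bitmap code splits the resulting 32-bit checksum into the on-disk lo/hi 16-bit fields; crc32c_stub() is a toy placeholder, not the kernel's crc32c():

#include <stdint.h>
#include <stdio.h>

static uint32_t crc32c_stub(uint32_t crc, const void *buf, unsigned int len)
{
	const uint8_t *p = buf;

	while (len--)		/* toy mixing function, for illustration only */
		crc = (crc >> 8) ^ ((crc ^ *p++) * 0x82F63B78u);
	return crc;
}

int main(void)
{
	uint8_t bitmap[64] = { 0xff, 0x0f };
	uint32_t seed = 0xdeadbeef;	/* models sbi->s_csum_seed */
	uint32_t csum = crc32c_stub(seed, bitmap, sizeof(bitmap));
	uint16_t lo = csum & 0xFFFF;	/* -> bg_inode_bitmap_csum_lo */
	uint16_t hi = csum >> 16;	/* -> bg_inode_bitmap_csum_hi */

	printf("csum=%08x lo=%04x hi=%04x\n", csum, lo, hi);
	return 0;
}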
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 87ee3a17bd29..e8c5525afc67 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -351,10 +351,9 @@ int ext4_check_blockref(const char *function, unsigned int line,
{
__le32 *bref = p;
unsigned int blk;
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- if (ext4_has_feature_journal(inode->i_sb) &&
- (inode->i_ino ==
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
+ if (journal && inode == journal->j_inode)
return 0;
while (bref < p+max) {
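
With s_journal cached, "is this the journal inode?" becomes a pointer comparison rather than re-reading s_journal_inum from the superblock. A tiny runnable model of the new check; the types here are stand-ins for the sketch:

#include <stdbool.h>
#include <stdio.h>

struct inode { unsigned long i_ino; };
struct journal { struct inode *j_inode; };

static bool is_journal_inode(const struct journal *j, const struct inode *i)
{
	return j && i == j->j_inode;
}

int main(void)
{
	struct inode ino = { .i_ino = 8 };	/* ext4's journal is inode 8 */
	struct journal j = { .j_inode = &ino };

	printf("%d\n", is_journal_inode(&j, &ino));	/* prints 1 */
	return 0;
}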
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 02d47a64e8d1..d4164c507a90 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -86,7 +86,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
dir->i_sb->s_blocksize);
const int next_offset = ((char *) de - buf) + rlen;
bool fake = is_fake_dir_entry(de);
- bool has_csum = ext4_has_metadata_csum(dir->i_sb);
+ bool has_csum = ext4_has_feature_metadata_csum(dir->i_sb);
if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir)))
error_msg = "rec_len is smaller than minimal";
@@ -104,6 +104,9 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
else if (unlikely(le32_to_cpu(de->inode) >
le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
error_msg = "inode out of bounds";
+ else if (unlikely(next_offset == size && de->name_len == 1 &&
+ de->name[0] == '.'))
+ error_msg = "'.' directory cannot be the last in data block";
else
return 0;
@@ -145,7 +148,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
return err;
/* Can we just clear INDEX flag to ignore htree information? */
- if (!ext4_has_metadata_csum(sb)) {
+ if (!ext4_has_feature_metadata_csum(sb)) {
/*
* We don't set the inode dirty flag since it's not
* critical that it gets flushed back to the disk.
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4e7de7eaa374..01a6e2de7fc3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -157,7 +157,7 @@ enum criteria {
/*
* Reads each block group sequentially, performing disk IO if
- * necessary, to find find_suitable block group. Tries to
+ * necessary, to find a suitable block group. Tries to
* allocate goal length but might trim the request if nothing
* is found after enough tries.
*/
@@ -185,14 +185,8 @@ enum criteria {
/* prefer goal again. length */
#define EXT4_MB_HINT_MERGE 0x0001
-/* blocks already reserved */
-#define EXT4_MB_HINT_RESERVED 0x0002
-/* metadata is being allocated */
-#define EXT4_MB_HINT_METADATA 0x0004
/* first blocks in the file */
#define EXT4_MB_HINT_FIRST 0x0008
-/* search for the best chunk */
-#define EXT4_MB_HINT_BEST 0x0010
/* data is being allocated */
#define EXT4_MB_HINT_DATA 0x0020
/* don't preallocate (for tails) */
@@ -213,15 +207,6 @@ enum criteria {
#define EXT4_MB_USE_RESERVED 0x2000
/* Do strict check for free blocks while retrying block allocation */
#define EXT4_MB_STRICT_CHECK 0x4000
-/* Large fragment size list lookup succeeded at least once for
- * CR_POWER2_ALIGNED */
-#define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED 0x8000
-/* Avg fragment size rb tree lookup succeeded at least once for
- * CR_GOAL_LEN_FAST */
-#define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED 0x00010000
-/* Avg fragment size rb tree lookup succeeded at least once for
- * CR_BEST_AVAIL_LEN */
-#define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED 0x00020000
struct ext4_allocation_request {
/* target inode for block we're allocating */
@@ -256,9 +241,19 @@ struct ext4_allocation_request {
#define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten)
#define EXT4_MAP_BOUNDARY BIT(BH_Boundary)
#define EXT4_MAP_DELAYED BIT(BH_Delay)
+/*
+ * This is for use in ext4_map_query_blocks() for a special case where we can
+ * have physically and logically contiguous blocks split across two leaf
+ * nodes instead of a single extent. This is required in case of atomic writes
+ * to know whether the returned extent is the last in its leaf. If so, look up
+ * the next extent in the next leaf block via ext4_map_query_blocks_next_in_leaf().
+ * - This is never going to be added to any buffer head state.
+ * - We use the next available bit after BH_BITMAP_UPTODATE.
+ */
+#define EXT4_MAP_QUERY_LAST_IN_LEAF BIT(BH_BITMAP_UPTODATE + 1)
#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
- EXT4_MAP_DELAYED)
+ EXT4_MAP_DELAYED | EXT4_MAP_QUERY_LAST_IN_LEAF)
struct ext4_map_blocks {
ext4_fsblk_t m_pblk;
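
EXT4_MAP_QUERY_LAST_IN_LEAF (added above) only matters to multi-fsblock atomic writers: if a queried extent is the last in its leaf, a physically contiguous mapping may continue in the next leaf and must be looked up there. A hedged userspace model of the consumer side; every name below is a stand-in, not part of the patch:

#include <stdio.h>

#define MAP_QUERY_LAST_IN_LEAF	(1u << 0)

struct map_result {
	unsigned int flags;
	unsigned int len;
};

/* pretend block 100 maps to an extent that ends its leaf */
static struct map_result query_blocks(unsigned int lblk)
{
	struct map_result r = {
		.flags = (lblk == 100) ? MAP_QUERY_LAST_IN_LEAF : 0,
		.len = 8,
	};
	return r;
}

int main(void)
{
	struct map_result r = query_blocks(100);

	if (r.flags & MAP_QUERY_LAST_IN_LEAF)
		printf("extent is last in leaf: check the next leaf too\n");
	return 0;
}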
@@ -278,7 +273,10 @@ struct ext4_system_blocks {
/*
* Flags for ext4_io_end->flags
*/
-#define EXT4_IO_END_UNWRITTEN 0x0001
+#define EXT4_IO_END_UNWRITTEN 0x0001
+#define EXT4_IO_END_FAILED 0x0002
+
+#define EXT4_IO_END_DEFER_COMPLETION (EXT4_IO_END_UNWRITTEN | EXT4_IO_END_FAILED)
struct ext4_io_end_vec {
struct list_head list; /* list of io_end_vec */
@@ -367,6 +365,8 @@ struct ext4_io_submit {
#define EXT4_MAX_BLOCKS(size, offset, blkbits) \
((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \
blkbits))
+#define EXT4_B_TO_LBLK(inode, offset) \
+ (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits)
/* Translate a block number to a cluster number */
#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits)
@@ -701,9 +701,6 @@ enum {
#define EXT4_GET_BLOCKS_CONVERT 0x0010
#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\
EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
- /* Convert extent to initialized after IO complete */
-#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
- EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
/* Eventual metadata allocation (due to growing extent tree)
* should not fail, so try to use reserved blocks for that.*/
#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020
@@ -715,11 +712,23 @@ enum {
#define EXT4_GET_BLOCKS_ZERO 0x0200
#define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\
EXT4_GET_BLOCKS_ZERO)
- /* Caller will submit data before dropping transaction handle. This
- * allows jbd2 to avoid submitting data before commit. */
+ /* Caller is in the context of data submission, such as writeback,
+ * fsync, etc. In particular, in the generic writeback path, the caller will
+ * submit data before dropping the transaction handle. This allows jbd2
+ * to avoid submitting data before commit. */
#define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400
+ /* Convert extent to initialized after IO complete */
+#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT |\
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |\
+ EXT4_GET_BLOCKS_IO_SUBMIT)
/* Caller is in the atomic context, find extent if it has been cached */
#define EXT4_GET_BLOCKS_CACHED_NOWAIT 0x0800
+/*
+ * An atomic write caller needs this to query in the slow path of the mixed
+ * mapping case, when a contiguous extent can be split across two adjacent
+ * leaf nodes. See EXT4_MAP_QUERY_LAST_IN_LEAF.
+ */
+#define EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF 0x1000
/*
* The bit position of these flags must not overlap with any of the
@@ -733,6 +742,13 @@ enum {
#define EXT4_EX_NOCACHE 0x40000000
#define EXT4_EX_FORCE_CACHE 0x20000000
#define EXT4_EX_NOFAIL 0x10000000
+/*
+ * ext4_map_query_blocks() uses this filter mask to select the flags that need
+ * to be passed when looking up/querying the on-disk extent tree.
+ */
+#define EXT4_EX_QUERY_FILTER (EXT4_EX_NOCACHE | EXT4_EX_FORCE_CACHE |\
+ EXT4_EX_NOFAIL |\
+ EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF)
/*
* Flags used by ext4_free_blocks
@@ -1056,15 +1072,16 @@ struct ext4_inode_info {
/* End of lblk range that needs to be committed in this fast commit */
ext4_lblk_t i_fc_lblk_len;
- /* Number of ongoing updates on this inode */
- atomic_t i_fc_updates;
- atomic_t i_unwritten; /* Nr. of inflight conversions pending */
+ spinlock_t i_raw_lock; /* protects updates to the raw inode */
/* Fast commit wait queue for this inode */
wait_queue_head_t i_fc_wait;
- /* Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len */
- struct mutex i_fc_lock;
+ /*
+ * Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len
+ * and inode's EXT4_FC_STATE_COMMITTING state bit.
+ */
+ spinlock_t i_fc_lock;
/*
* i_disksize keeps track of what the inode size is ON DISK, not
@@ -1097,8 +1114,6 @@ struct ext4_inode_info {
struct inode vfs_inode;
struct jbd2_inode *jinode;
- spinlock_t i_raw_lock; /* protects updates to the raw inode */
-
/*
* File creation time. Its function is same as that of
* struct timespec64 i_{a,c,m}time in the generic inode.
@@ -1141,6 +1156,7 @@ struct ext4_inode_info {
/* quota space reservation, managed internally by quota code */
qsize_t i_reserved_quota;
#endif
+ spinlock_t i_block_reservation_lock;
/* Lock protecting lists below */
spinlock_t i_completed_io_lock;
@@ -1151,8 +1167,6 @@ struct ext4_inode_info {
struct list_head i_rsv_conversion_list;
struct work_struct i_rsv_conversion_work;
- spinlock_t i_block_reservation_lock;
-
/*
* Transactions that contain inode's metadata needed to complete
* fsync and fdatasync, respectively.
@@ -1579,16 +1593,14 @@ struct ext4_sb_info {
unsigned short *s_mb_offsets;
unsigned int *s_mb_maxs;
unsigned int s_group_info_size;
- unsigned int s_mb_free_pending;
+ atomic_t s_mb_free_pending;
struct list_head s_freed_data_list[2]; /* List of blocks to be freed
after commit completed */
struct list_head s_discard_list;
struct work_struct s_discard_work;
atomic_t s_retry_alloc_pending;
- struct list_head *s_mb_avg_fragment_size;
- rwlock_t *s_mb_avg_fragment_size_locks;
- struct list_head *s_mb_largest_free_orders;
- rwlock_t *s_mb_largest_free_orders_locks;
+ struct xarray *s_mb_avg_fragment_size;
+ struct xarray *s_mb_largest_free_orders;
/* tunables */
unsigned long s_stripe;
@@ -1600,12 +1612,15 @@ struct ext4_sb_info {
unsigned int s_mb_order2_reqs;
unsigned int s_mb_group_prealloc;
unsigned int s_max_dir_size_kb;
- /* where last allocation was done - for stream allocation */
- unsigned long s_mb_last_group;
- unsigned long s_mb_last_start;
unsigned int s_mb_prefetch;
unsigned int s_mb_prefetch_limit;
unsigned int s_mb_best_avail_max_trim_order;
+ unsigned int s_sb_update_sec;
+ unsigned int s_sb_update_kb;
+
+ /* where last allocation was done - for stream allocation */
+ ext4_group_t *s_mb_last_groups;
+ unsigned int s_mb_nr_global_goals;
/* stats for buddy allocator */
atomic_t s_bal_reqs; /* number of reqs with len > 1 */
@@ -1615,12 +1630,10 @@ struct ext4_sb_info {
atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS]; /* total extents scanned */
atomic_t s_bal_groups_scanned; /* number of groups scanned */
atomic_t s_bal_goals; /* goal hits */
+ atomic_t s_bal_stream_goals; /* stream allocation global goal hits */
atomic_t s_bal_len_goals; /* len goal hits */
atomic_t s_bal_breaks; /* too long searches */
atomic_t s_bal_2orders; /* 2^order hits */
- atomic_t s_bal_p2_aligned_bad_suggestions;
- atomic_t s_bal_goal_fast_bad_suggestions;
- atomic_t s_bal_best_avail_bad_suggestions;
atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS];
atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS];
atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */
@@ -1749,7 +1762,7 @@ struct ext4_sb_info {
* following fields:
* ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh.
*/
- spinlock_t s_fc_lock;
+ struct mutex s_fc_lock;
struct buffer_head *s_fc_bh;
struct ext4_fc_stats s_fc_stats;
tid_t s_fc_ineligible_tid;
@@ -1821,7 +1834,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
*/
enum {
EXT4_MF_MNTDIR_SAMPLED,
- EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */
+ EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */
+ EXT4_MF_JOURNAL_DESTROY /* Journal is in process of destroying */
};
static inline void ext4_set_mount_flag(struct super_block *sb, int bit)
@@ -1907,6 +1921,7 @@ enum {
EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */
EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */
+ EXT4_STATE_FC_FLUSHING_DATA, /* Fast commit flushing data */
EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */
};
@@ -2232,15 +2247,32 @@ extern int ext4_feature_set_ok(struct super_block *sb, int readonly);
/*
* Superblock flags
*/
-#define EXT4_FLAGS_RESIZING 0
-#define EXT4_FLAGS_SHUTDOWN 1
-#define EXT4_FLAGS_BDEV_IS_DAX 2
+enum {
+ EXT4_FLAGS_RESIZING, /* Avoid superblock update and resize race */
+ EXT4_FLAGS_SHUTDOWN, /* Prevent access to the file system */
+ EXT4_FLAGS_BDEV_IS_DAX, /* Current block device support DAX */
+ EXT4_FLAGS_EMERGENCY_RO,/* Emergency read-only due to fs errors */
+};
static inline int ext4_forced_shutdown(struct super_block *sb)
{
return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags);
}
+static inline int ext4_emergency_ro(struct super_block *sb)
+{
+ return test_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags);
+}
+
+static inline int ext4_emergency_state(struct super_block *sb)
+{
+ if (unlikely(ext4_forced_shutdown(sb)))
+ return -EIO;
+ if (unlikely(ext4_emergency_ro(sb)))
+ return -EROFS;
+ return 0;
+}
+
/*
* Default values for user and/or group using reserved blocks
*/
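
ext4_emergency_state() gives write paths one errno-style guard covering both failure modes: shutdown maps to -EIO, emergency read-only to -EROFS. A small runnable model of the same pattern, with stand-in types:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct sb_state {
	bool shutdown;		/* models EXT4_FLAGS_SHUTDOWN */
	bool emergency_ro;	/* models EXT4_FLAGS_EMERGENCY_RO */
};

static int emergency_state(const struct sb_state *s)
{
	if (s->shutdown)
		return -EIO;
	if (s->emergency_ro)
		return -EROFS;
	return 0;
}

int main(void)
{
	struct sb_state s = { .emergency_ro = true };

	printf("write path would return %d\n", emergency_state(&s));
	return 0;
}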
@@ -2272,10 +2304,19 @@ static inline int ext4_forced_shutdown(struct super_block *sb)
#define EXT4_DEFM_NODELALLOC 0x0800
/*
- * Default journal batch times
+ * Default journal batch times and ioprio.
*/
#define EXT4_DEF_MIN_BATCH_TIME 0
#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
+#define EXT4_DEF_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
+
+
+/*
+ * Default values for superblock update
+ */
+#define EXT4_DEF_SB_UPDATE_INTERVAL_SEC (3600) /* seconds (1 hour) */
+#define EXT4_DEF_SB_UPDATE_INTERVAL_KB (16384) /* kilobytes (16MB) */
+
/*
* Minimum number of groups in a flexgroup before we separate out
@@ -2457,8 +2498,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
#define DX_HASH_SIPHASH 6
#define DX_HASH_LAST DX_HASH_SIPHASH
-static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
- const void *address, unsigned int length)
+static inline u32 ext4_chksum(u32 crc, const void *address, unsigned int length)
{
return crc32c(crc, address, length);
}
@@ -2810,8 +2850,7 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
struct ext4_dir_entry_2 *dirent,
struct fscrypt_str *ent_name);
extern void ext4_htree_free_dir_info(struct dir_private_info *p);
-extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
- struct buffer_head *bh,
+extern int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh,
void *buf, int buf_size,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **dest_de);
@@ -2893,8 +2932,6 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry);
void ext4_fc_track_inode(handle_t *handle, struct inode *inode);
void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle);
-void ext4_fc_start_update(struct inode *inode);
-void ext4_fc_stop_update(struct inode *inode);
void ext4_fc_del(struct inode *inode);
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block);
void ext4_fc_replay_cleanup(struct super_block *sb);
@@ -2944,6 +2981,7 @@ static inline bool ext4_mb_cr_expensive(enum criteria cr)
void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
struct ext4_inode_info *ei);
int ext4_inode_is_fast_symlink(struct inode *inode);
+void ext4_check_map_extents_env(struct inode *inode);
struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
@@ -2964,6 +3002,7 @@ int ext4_walk_page_buffers(handle_t *handle,
struct buffer_head *bh));
int do_journal_get_write_access(handle_t *handle, struct inode *inode,
struct buffer_head *bh);
+void ext4_set_inode_mapping_order(struct inode *inode);
#define FALL_BACK_TO_NONDELALLOC 1
#define CONVERT_INLINE_DATA 2
@@ -3001,13 +3040,17 @@ extern int ext4_inode_attach_jinode(struct inode *inode);
extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
extern int ext4_break_layouts(struct inode *);
+extern int ext4_truncate_page_cache_block_range(struct inode *inode,
+ loff_t start, loff_t end);
extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
-extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
+extern int ext4_chunk_trans_extent(struct inode *inode, int nrblocks);
+extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+ int pextents);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
@@ -3019,6 +3062,17 @@ extern void ext4_da_update_reserve_space(struct inode *inode,
extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
ext4_fsblk_t pblk, ext4_lblk_t len);
+static inline bool is_special_ino(struct super_block *sb, unsigned long ino)
+{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ return (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) ||
+ ino == le32_to_cpu(es->s_usr_quota_inum) ||
+ ino == le32_to_cpu(es->s_grp_quota_inum) ||
+ ino == le32_to_cpu(es->s_prj_quota_inum) ||
+ ino == le32_to_cpu(es->s_orphan_file_inum);
+}
+
/* indirect.c */
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
@@ -3031,8 +3085,8 @@ extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
int ext4_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
-int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
+int ext4_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
extern void ext4_reset_inode_seed(struct inode *inode);
int ext4_update_overhead(struct super_block *sb, bool force);
int ext4_force_shutdown(struct super_block *sb, u32 flags);
@@ -3088,8 +3142,7 @@ extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wa
extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block);
extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
extern int ext4_calculate_overhead(struct super_block *sb);
-extern __le32 ext4_superblock_csum(struct super_block *sb,
- struct ext4_super_block *es);
+extern __le32 ext4_superblock_csum(struct ext4_super_block *es);
extern void ext4_superblock_csum_set(struct super_block *sb);
extern int ext4_alloc_flex_bg_array(struct super_block *sb,
ext4_group_t ngroup);
@@ -3259,14 +3312,10 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
extern int ext4_register_li_request(struct super_block *sb,
ext4_group_t first_not_zeroed);
-static inline int ext4_has_metadata_csum(struct super_block *sb)
-{
- return ext4_has_feature_metadata_csum(sb);
-}
-
static inline int ext4_has_group_desc_csum(struct super_block *sb)
{
- return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb);
+ return ext4_has_feature_gdt_csum(sb) ||
+ ext4_has_feature_metadata_csum(sb);
}
#define ext4_read_incompat_64bit_val(es, name) \
@@ -3351,6 +3400,13 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
return 1 << sbi->s_log_groups_per_flex;
}
+static inline loff_t ext4_get_maxbytes(struct inode *inode)
+{
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return inode->i_sb->s_maxbytes;
+ return EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+}
+
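
ext4_get_maxbytes() centralizes a branch that call sites previously open-coded; the ext4_fiemap_check_ranges() hunk in extents.c below switches to it. A toy model, with illustrative stand-in limits rather than the real values:

#include <stdbool.h>
#include <stdio.h>

static long long get_maxbytes(bool extent_mapped)
{
	const long long s_maxbytes = 1LL << 48;		/* stand-in limit */
	const long long bitmap_maxbytes = 1LL << 41;	/* stand-in limit */

	return extent_mapped ? s_maxbytes : bitmap_maxbytes;
}

int main(void)
{
	printf("extents: %lld, indirect: %lld\n",
	       get_maxbytes(true), get_maxbytes(false));
	return 0;
}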
#define ext4_std_error(sb, errno) \
do { \
if ((errno)) \
@@ -3415,8 +3471,6 @@ struct ext4_group_info {
void *bb_bitmap;
#endif
struct rw_semaphore alloc_sem;
- struct list_head bb_avg_fragment_size_node;
- struct list_head bb_largest_free_order_node;
ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block
* regions, index is order.
* bb_counters[3] = 5 means
@@ -3467,23 +3521,28 @@ static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD);
}
+static inline bool ext4_try_lock_group(struct super_block *sb, ext4_group_t group)
+{
+ if (!spin_trylock(ext4_group_lock_ptr(sb, group)))
+ return false;
+ /*
+ * We're able to grab the lock right away, so drop the lock
+ * contention counter.
+ */
+ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
+ return true;
+}
+
static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
{
- spinlock_t *lock = ext4_group_lock_ptr(sb, group);
- if (spin_trylock(lock))
- /*
- * We're able to grab the lock right away, so drop the
- * lock contention counter.
- */
- atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
- else {
+ if (!ext4_try_lock_group(sb, group)) {
/*
* The lock is busy, so bump the contention counter,
* and then wait on the spin lock.
*/
atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1,
EXT4_MAX_CONTENTION);
- spin_lock(lock);
+ spin_lock(ext4_group_lock_ptr(sb, group));
}
}
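
Factoring out ext4_try_lock_group() lets allocator scans skip a busy group instead of spinning on it. A runnable userspace model of that skip-if-busy pattern, with an atomic_bool standing in for the group spinlock:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NGROUPS 4

static atomic_bool group_lock[NGROUPS];
static atomic_int lock_busy;

static bool try_lock_group(int g)
{
	if (atomic_exchange(&group_lock[g], true))
		return false;		/* busy: let the caller skip it */
	/* got it right away: decay the contention hint toward zero,
	 * as atomic_add_unless(&s_lock_busy, -1, 0) does above */
	int v = atomic_load(&lock_busy);
	while (v > 0 && !atomic_compare_exchange_weak(&lock_busy, &v, v - 1))
		;
	return true;
}

static void unlock_group(int g)
{
	atomic_store(&group_lock[g], false);
}

int main(void)
{
	for (int g = 0; g < NGROUPS; g++) {
		if (!try_lock_group(g))
			continue;	/* scan the next group instead */
		printf("scanning group %d\n", g);
		unlock_group(g);
	}
	return 0;
}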
@@ -3538,6 +3597,7 @@ extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
extern int ext4_get_max_inline_size(struct inode *inode);
extern int ext4_find_inline_data_nolock(struct inode *inode);
extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
+extern void ext4_update_final_de(void *de_buf, int old_size, int new_size);
int ext4_readpage_inline(struct inode *inode, struct folio *folio);
extern int ext4_try_to_write_inline_data(struct address_space *mapping,
@@ -3546,11 +3606,11 @@ extern int ext4_try_to_write_inline_data(struct address_space *mapping,
struct folio **foliop);
int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
unsigned copied, struct folio *folio);
-extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
- struct inode *inode,
- loff_t pos, unsigned len,
- struct folio **foliop,
- void **fsdata);
+extern int ext4_generic_write_inline_data(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ struct folio **foliop,
+ void **fsdata, bool da);
extern int ext4_try_add_inline_entry(handle_t *handle,
struct ext4_filename *fname,
struct inode *dir, struct inode *inode);
@@ -3597,10 +3657,10 @@ static inline int ext4_has_inline_data(struct inode *inode)
extern const struct inode_operations ext4_dir_inode_operations;
extern const struct inode_operations ext4_special_inode_operations;
extern struct dentry *ext4_get_parent(struct dentry *child);
-extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
- struct ext4_dir_entry_2 *de,
- int blocksize, int csum_size,
- unsigned int parent_ino, int dotdot_real_len);
+extern int ext4_init_dirblock(handle_t *handle, struct inode *inode,
+ struct buffer_head *dir_block,
+ unsigned int parent_ino, void *inline_buf,
+ int inline_size);
extern void ext4_initialize_dirent_tail(struct buffer_head *bh,
unsigned int blocksize);
extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
@@ -3683,6 +3743,8 @@ extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
loff_t len);
extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
loff_t offset, ssize_t len);
+extern int ext4_convert_unwritten_extents_atomic(handle_t *handle,
+ struct inode *inode, loff_t offset, ssize_t len);
extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
ext4_io_end_t *io_end);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
@@ -3785,34 +3847,19 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
}
-/* For ioend & aio unwritten conversion wait queues */
-#define EXT4_WQ_HASH_SZ 37
-#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
- EXT4_WQ_HASH_SZ])
-extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
-
extern int ext4_resize_begin(struct super_block *sb);
extern int ext4_resize_end(struct super_block *sb, bool update_backups);
-static inline void ext4_set_io_unwritten_flag(struct inode *inode,
- struct ext4_io_end *io_end)
+static inline void ext4_set_io_unwritten_flag(struct ext4_io_end *io_end)
{
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+ if (!(io_end->flag & EXT4_IO_END_UNWRITTEN))
io_end->flag |= EXT4_IO_END_UNWRITTEN;
- atomic_inc(&EXT4_I(inode)->i_unwritten);
- }
}
static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
{
- struct inode *inode = io_end->inode;
-
- if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN)
io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
- /* Wake up anyone waiting on unwritten extent conversion */
- if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
- wake_up_all(ext4_ioend_wq(inode));
- }
}
extern const struct iomap_ops ext4_iomap_ops;
@@ -3835,7 +3882,9 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh)
static inline bool ext4_inode_can_atomic_write(struct inode *inode)
{
- return S_ISREG(inode->i_mode) && EXT4_SB(inode->i_sb)->s_awu_min > 0;
+ return S_ISREG(inode->i_mode) &&
+ ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+ EXT4_SB(inode->i_sb)->s_awu_min > 0;
}
extern int ext4_block_write_begin(handle_t *handle, struct folio *folio,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 26435f3a3094..c484125d963f 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -31,13 +31,6 @@
#define CHECK_BINSEARCH__
/*
- * If EXT_STATS is defined then stats numbers are collected.
- * These number will be displayed at umount time.
- */
-#define EXT_STATS_
-
-
-/*
* ext4_inode has i_block array (60 bytes total).
* The first 12 bytes store ext4_extent_header;
* the remainder stores an array of ext4_extent.
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index da4a82456383..b3e9b7bd7978 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -16,7 +16,8 @@ int ext4_inode_journal_mode(struct inode *inode)
ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
(ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
- !test_opt(inode->i_sb, DELALLOC))) {
+ !test_opt(inode->i_sb, DELALLOC) &&
+ !mapping_large_folio_support(inode->i_mapping))) {
/* We do not support data journalling for encrypted data */
if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
@@ -63,12 +64,14 @@ static void ext4_put_nojournal(handle_t *handle)
*/
static int ext4_journal_check_start(struct super_block *sb)
{
+ int ret;
journal_t *journal;
might_sleep();
- if (unlikely(ext4_forced_shutdown(sb)))
- return -EIO;
+ ret = ext4_emergency_state(sb);
+ if (unlikely(ret))
+ return ret;
if (WARN_ON_ONCE(sb_rdonly(sb)))
return -EROFS;
@@ -244,7 +247,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
}
} else
ext4_check_bdev_write_error(sb);
- if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb))
+ if (trigger_type == EXT4_JTR_NONE ||
+ !ext4_has_feature_metadata_csum(sb))
return 0;
BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT);
jbd2_journal_set_triggers(bh,
@@ -331,7 +335,8 @@ int __ext4_journal_get_create_access(const char *where, unsigned int line,
err);
return err;
}
- if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb))
+ if (trigger_type == EXT4_JTR_NONE ||
+ !ext4_has_feature_metadata_csum(sb))
return 0;
BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT);
jbd2_journal_set_triggers(bh,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 0c77697d5e90..63d17c5201b5 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -122,90 +122,6 @@
#define EXT4_HT_EXT_CONVERT 11
#define EXT4_HT_MAX 12
-/**
- * struct ext4_journal_cb_entry - Base structure for callback information.
- *
- * This struct is a 'seed' structure for a using with your own callback
- * structs. If you are using callbacks you must allocate one of these
- * or another struct of your own definition which has this struct
- * as it's first element and pass it to ext4_journal_callback_add().
- */
-struct ext4_journal_cb_entry {
- /* list information for other callbacks attached to the same handle */
- struct list_head jce_list;
-
- /* Function to call with this callback structure */
- void (*jce_func)(struct super_block *sb,
- struct ext4_journal_cb_entry *jce, int error);
-
- /* user data goes here */
-};
-
-/**
- * ext4_journal_callback_add: add a function to call after transaction commit
- * @handle: active journal transaction handle to register callback on
- * @func: callback function to call after the transaction has committed:
- * @sb: superblock of current filesystem for transaction
- * @jce: returned journal callback data
- * @rc: journal state at commit (0 = transaction committed properly)
- * @jce: journal callback data (internal and function private data struct)
- *
- * The registered function will be called in the context of the journal thread
- * after the transaction for which the handle was created has completed.
- *
- * No locks are held when the callback function is called, so it is safe to
- * call blocking functions from within the callback, but the callback should
- * not block or run for too long, or the filesystem will be blocked waiting for
- * the next transaction to commit. No journaling functions can be used, or
- * there is a risk of deadlock.
- *
- * There is no guaranteed calling order of multiple registered callbacks on
- * the same transaction.
- */
-static inline void _ext4_journal_callback_add(handle_t *handle,
- struct ext4_journal_cb_entry *jce)
-{
- /* Add the jce to transaction's private list */
- list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
-}
-
-static inline void ext4_journal_callback_add(handle_t *handle,
- void (*func)(struct super_block *sb,
- struct ext4_journal_cb_entry *jce,
- int rc),
- struct ext4_journal_cb_entry *jce)
-{
- struct ext4_sb_info *sbi =
- EXT4_SB(handle->h_transaction->t_journal->j_private);
-
- /* Add the jce to transaction's private list */
- jce->jce_func = func;
- spin_lock(&sbi->s_md_lock);
- _ext4_journal_callback_add(handle, jce);
- spin_unlock(&sbi->s_md_lock);
-}
-
-
-/**
- * ext4_journal_callback_del: delete a registered callback
- * @handle: active journal transaction handle on which callback was registered
- * @jce: registered journal callback entry to unregister
- * Return true if object was successfully removed
- */
-static inline bool ext4_journal_callback_try_del(handle_t *handle,
- struct ext4_journal_cb_entry *jce)
-{
- bool deleted;
- struct ext4_sb_info *sbi =
- EXT4_SB(handle->h_transaction->t_journal->j_private);
-
- spin_lock(&sbi->s_md_lock);
- deleted = !list_empty(&jce->jce_list);
- list_del_init(&jce->jce_list);
- spin_unlock(&sbi->s_md_lock);
- return deleted;
-}
-
int
ext4_mark_iloc_dirty(handle_t *handle,
struct inode *inode,
@@ -403,10 +319,10 @@ static inline int ext4_journal_ensure_credits(handle_t *handle, int credits,
revoke_creds, 0);
}
-static inline int ext4_journal_blocks_per_page(struct inode *inode)
+static inline int ext4_journal_blocks_per_folio(struct inode *inode)
{
if (EXT4_JOURNAL(inode) != NULL)
- return jbd2_journal_blocks_per_page(inode);
+ return jbd2_journal_blocks_per_folio(inode);
return 0;
}
@@ -513,4 +429,33 @@ static inline int ext4_should_dioread_nolock(struct inode *inode)
return 1;
}
+/*
+ * Pass the journal explicitly as it may not be cached in sbi->s_journal in
+ * some cases.
+ */
+static inline int ext4_journal_destroy(struct ext4_sb_info *sbi, journal_t *journal)
+{
+ int err = 0;
+
+ /*
+ * At this point only two things can be operating on the journal.
+ * JBD2 thread performing transaction commit and s_sb_upd_work
+ * issuing sb update through the journal. Once we set
+ * EXT4_JOURNAL_DESTROY, new ext4_handle_error() calls will not
+ * queue s_sb_upd_work and ext4_force_commit() makes sure any
+ * ext4_handle_error() calls from the running transaction commit are
+ * finished. Hence no new s_sb_upd_work can be queued after we
+ * flush it here.
+ */
+ ext4_set_mount_flag(sbi->s_sb, EXT4_MF_JOURNAL_DESTROY);
+
+ ext4_force_commit(sbi->s_sb);
+ flush_work(&sbi->s_sb_upd_work);
+
+ err = jbd2_journal_destroy(journal);
+ sbi->s_journal = NULL;
+
+ return err;
+}
+
#endif /* _EXT4_JBD2_H */
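
The ordering in ext4_journal_destroy() is the point: publish EXT4_MF_JOURNAL_DESTROY before forcing the commit and flushing s_sb_upd_work, so no new superblock-update work can be queued once the flush has run. A userspace model of that shutdown handshake; all names are stand-ins for the sketch:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool journal_destroying;

/* models ext4_handle_error() deciding whether to queue s_sb_upd_work */
static bool queue_sb_update_work(void)
{
	if (atomic_load(&journal_destroying))
		return false;	/* refuse: the journal is going away */
	return true;		/* pretend the work was queued */
}

static void journal_destroy(void)
{
	atomic_store(&journal_destroying, true);	/* 1. block new work */
	/* 2. force a commit so in-flight error handling finishes */
	/* 3. flush any already-queued superblock update work */
	/* 4. only now tear down the journal */
	printf("late queue attempt accepted: %d\n", queue_sb_update_work());
}

int main(void)
{
	journal_destroy();
	return 0;
}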
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index a07a98a4b97a..ca5499e9412b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -50,10 +50,9 @@ static __le32 ext4_extent_block_csum(struct inode *inode,
struct ext4_extent_header *eh)
{
struct ext4_inode_info *ei = EXT4_I(inode);
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 csum;
- csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
+ csum = ext4_chksum(ei->i_csum_seed, (__u8 *)eh,
EXT4_EXTENT_TAIL_OFFSET(eh));
return cpu_to_le32(csum);
}
@@ -63,7 +62,7 @@ static int ext4_extent_block_csum_verify(struct inode *inode,
{
struct ext4_extent_tail *et;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return 1;
et = find_ext4_extent_tail(eh);
@@ -77,7 +76,7 @@ static void ext4_extent_block_csum_set(struct inode *inode,
{
struct ext4_extent_tail *et;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return;
et = find_ext4_extent_tail(eh);
@@ -611,6 +610,8 @@ int ext4_ext_precache(struct inode *inode)
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return 0; /* not an extent-mapped inode */
+ ext4_check_map_extents_env(inode);
+
down_read(&ei->i_data_sem);
depth = ext_depth(inode);
@@ -1530,7 +1531,7 @@ static int ext4_ext_search_left(struct inode *inode,
static int ext4_ext_search_right(struct inode *inode,
struct ext4_ext_path *path,
ext4_lblk_t *logical, ext4_fsblk_t *phys,
- struct ext4_extent *ret_ex)
+ struct ext4_extent *ret_ex, int flags)
{
struct buffer_head *bh = NULL;
struct ext4_extent_header *eh;
@@ -1604,7 +1605,8 @@ got_index:
ix++;
while (++depth < path->p_depth) {
/* subtract from p_depth to get proper eh_depth */
- bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
+ bh = read_extent_tree_block(inode, ix, path->p_depth - depth,
+ flags);
if (IS_ERR(bh))
return PTR_ERR(bh);
eh = ext_block_hdr(bh);
@@ -1612,7 +1614,7 @@ got_index:
put_bh(bh);
}
- bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
+ bh = read_extent_tree_block(inode, ix, path->p_depth - depth, flags);
if (IS_ERR(bh))
return PTR_ERR(bh);
eh = ext_block_hdr(bh);
@@ -2396,18 +2398,20 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
{
int index;
- int depth;
/* If we are converting the inline data, only one is needed here. */
if (ext4_has_inline_data(inode))
return 1;
- depth = ext_depth(inode);
-
+ /*
+ * Extent tree can change between the time we estimate credits and
+ * the time we actually modify the tree. Assume the worst case.
+ */
if (extents <= 1)
- index = depth * 2;
+ index = (EXT4_MAX_EXTENT_DEPTH * 2) + extents;
else
- index = depth * 3;
+ index = (EXT4_MAX_EXTENT_DEPTH * 3) +
+ DIV_ROUND_UP(extents, ext4_ext_space_block(inode, 0));
return index;
}
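
The estimate now assumes the deepest possible tree instead of the current depth, since the tree can grow between estimation and use. A worked model of the arithmetic; the constants are assumptions for the sketch (EXT4_MAX_EXTENT_DEPTH is 5 in current trees, and a 4K block holds (4096 - 12) / 12 = 340 extent entries):

#include <stdio.h>

#define MAX_EXTENT_DEPTH	5	/* assumption: EXT4_MAX_EXTENT_DEPTH */
#define EXTENTS_PER_BLOCK	340	/* (4096 - 12) / 12 for 4K blocks */

static int ext_index_trans_blocks(int extents)
{
	if (extents <= 1)
		return MAX_EXTENT_DEPTH * 2 + extents;
	return MAX_EXTENT_DEPTH * 3 +
	       (extents + EXTENTS_PER_BLOCK - 1) / EXTENTS_PER_BLOCK;
}

int main(void)
{
	printf("1 extent    -> %d index credits\n", ext_index_trans_blocks(1));
	printf("680 extents -> %d index credits\n", ext_index_trans_blocks(680));
	return 0;
}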
@@ -2821,6 +2825,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
struct partial_cluster partial;
handle_t *handle;
int i = 0, err = 0;
+ int flags = EXT4_EX_NOCACHE | EXT4_EX_NOFAIL;
partial.pclu = 0;
partial.lblk = 0;
@@ -2851,8 +2856,7 @@ again:
ext4_fsblk_t pblk;
/* find extent for or closest extent to this block */
- path = ext4_find_extent(inode, end, NULL,
- EXT4_EX_NOCACHE | EXT4_EX_NOFAIL);
+ path = ext4_find_extent(inode, end, NULL, flags);
if (IS_ERR(path)) {
ext4_journal_stop(handle);
return PTR_ERR(path);
@@ -2918,7 +2922,7 @@ again:
*/
lblk = ex_end + 1;
err = ext4_ext_search_right(inode, path, &lblk, &pblk,
- NULL);
+ NULL, flags);
if (err < 0)
goto out;
if (pblk) {
@@ -2994,8 +2998,7 @@ again:
i + 1, ext4_idx_pblock(path[i].p_idx));
memset(path + i + 1, 0, sizeof(*path));
bh = read_extent_tree_block(inode, path[i].p_idx,
- depth - i - 1,
- EXT4_EX_NOCACHE);
+ depth - i - 1, flags);
if (IS_ERR(bh)) {
/* should we reset i_size? */
err = PTR_ERR(bh);
@@ -4202,7 +4205,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
/* find extent for this block */
- path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
+ path = ext4_find_extent(inode, map->m_lblk, NULL, flags);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out;
@@ -4314,7 +4317,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
if (err)
goto out;
ar.lright = map->m_lblk;
- err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
+ err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright,
+ &ex2, flags);
if (err < 0)
goto out;
@@ -4433,6 +4437,20 @@ got_allocated_blocks:
allocated = map->m_len;
ext4_ext_show_leaf(inode, path);
out:
+ /*
+ * We never use EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF with CREATE flag.
+ * So we know that the depth used here is correct, since there was no
+ * block allocation done if EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF is set.
+ * If tomorrow we start using this QUERY flag with CREATE, then we will
+ * need to re-calculate the depth as it might have changed due to block
+ * allocation.
+ */
+ if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) {
+ WARN_ON_ONCE(flags & EXT4_GET_BLOCKS_CREATE);
+ if (!err && ex && (ex == EXT_LAST_EXTENT(path[depth].p_hdr)))
+ map->m_flags |= EXT4_MAP_QUERY_LAST_IN_LEAF;
+ }
+
ext4_free_ext_path(path);
trace_ext4_ext_map_blocks_exit(inode, flags, map,
@@ -4483,6 +4501,8 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
struct ext4_map_blocks map;
unsigned int credits;
loff_t epos, old_size = i_size_read(inode);
+ unsigned int blkbits = inode->i_blkbits;
+ bool alloc_zero = false;
BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
map.m_lblk = offset;
@@ -4496,6 +4516,17 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
/*
+ * Doing the actual zero-out inside a running journal transaction
+ * costs a lot. First allocate an unwritten extent and then
+ * convert it to written after zeroing it out.
+ */
+ if (flags & EXT4_GET_BLOCKS_ZERO) {
+ flags &= ~EXT4_GET_BLOCKS_ZERO;
+ flags |= EXT4_GET_BLOCKS_UNWRIT_EXT;
+ alloc_zero = true;
+ }
+
+ /*
* credits to insert 1 extent into extent tree
*/
credits = ext4_chunk_trans_blocks(inode, len);
@@ -4531,9 +4562,7 @@ retry:
* allow a full retry cycle for any remaining allocations
*/
retries = 0;
- map.m_lblk += ret;
- map.m_len = len = len - ret;
- epos = (loff_t)map.m_lblk << inode->i_blkbits;
+ epos = (loff_t)(map.m_lblk + ret) << blkbits;
inode_set_ctime_current(inode);
if (new_size) {
if (epos > new_size)
@@ -4553,6 +4582,21 @@ retry:
ret2 = ret3 ? ret3 : ret2;
if (unlikely(ret2))
break;
+
+ if (alloc_zero &&
+ (map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN))) {
+ ret2 = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk,
+ map.m_len);
+ if (likely(!ret2))
+ ret2 = ext4_convert_unwritten_extents(NULL,
+ inode, (loff_t)map.m_lblk << blkbits,
+ (loff_t)map.m_len << blkbits);
+ if (ret2)
+ break;
+ }
+
+ map.m_lblk += ret;
+ map.m_len = len = len - ret;
}
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
@@ -4568,131 +4612,69 @@ static long ext4_zero_range(struct file *file, loff_t offset,
loff_t len, int mode)
{
struct inode *inode = file_inode(file);
- struct address_space *mapping = file->f_mapping;
handle_t *handle = NULL;
- unsigned int max_blocks;
loff_t new_size = 0;
- int ret = 0;
- int flags;
- int credits;
- int partial_begin, partial_end;
- loff_t start, end;
- ext4_lblk_t lblk;
+ loff_t end = offset + len;
+ ext4_lblk_t start_lblk, end_lblk;
+ unsigned int blocksize = i_blocksize(inode);
unsigned int blkbits = inode->i_blkbits;
+ int ret, flags, credits;
trace_ext4_zero_range(inode, offset, len, mode);
+ WARN_ON_ONCE(!inode_is_locked(inode));
- /*
- * Round up offset. This is not fallocate, we need to zero out
- * blocks, so convert interior block aligned part of the range to
- * unwritten and possibly manually zero out unaligned parts of the
- * range. Here, start and partial_begin are inclusive, end and
- * partial_end are exclusive.
- */
- start = round_up(offset, 1 << blkbits);
- end = round_down((offset + len), 1 << blkbits);
-
- if (start < offset || end > offset + len)
- return -EINVAL;
- partial_begin = offset & ((1 << blkbits) - 1);
- partial_end = (offset + len) & ((1 << blkbits) - 1);
-
- lblk = start >> blkbits;
- max_blocks = (end >> blkbits);
- if (max_blocks < lblk)
- max_blocks = 0;
- else
- max_blocks -= lblk;
-
- inode_lock(inode);
-
- /*
- * Indirect files do not support unwritten extents
- */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
- ret = -EOPNOTSUPP;
- goto out_mutex;
- }
+ /* Indirect files do not support unwritten extents */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ return -EOPNOTSUPP;
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- (offset + len > inode->i_size ||
- offset + len > EXT4_I(inode)->i_disksize)) {
- new_size = offset + len;
+ (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) {
+ new_size = end;
ret = inode_newsize_ok(inode, new_size);
if (ret)
- goto out_mutex;
+ return ret;
}
flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
-
- /* Wait all existing dio workers, newcomers will block on i_rwsem */
- inode_dio_wait(inode);
-
- ret = file_modified(file);
- if (ret)
- goto out_mutex;
-
/* Preallocate the range including the unaligned edges */
- if (partial_begin || partial_end) {
- ret = ext4_alloc_file_blocks(file,
- round_down(offset, 1 << blkbits) >> blkbits,
- (round_up((offset + len), 1 << blkbits) -
- round_down(offset, 1 << blkbits)) >> blkbits,
- new_size, flags);
- if (ret)
- goto out_mutex;
+ if (!IS_ALIGNED(offset | end, blocksize)) {
+ ext4_lblk_t alloc_lblk = offset >> blkbits;
+ ext4_lblk_t len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits);
+ ret = ext4_alloc_file_blocks(file, alloc_lblk, len_lblk,
+ new_size, flags);
+ if (ret)
+ return ret;
}
- /* Zero range excluding the unaligned edges */
- if (max_blocks > 0) {
- flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
- EXT4_EX_NOCACHE);
-
- /*
- * Prevent page faults from reinstantiating pages we have
- * released from page cache.
- */
- filemap_invalidate_lock(mapping);
-
- ret = ext4_break_layouts(inode);
- if (ret) {
- filemap_invalidate_unlock(mapping);
- goto out_mutex;
- }
-
- ret = ext4_update_disksize_before_punch(inode, offset, len);
- if (ret) {
- filemap_invalidate_unlock(mapping);
- goto out_mutex;
- }
+ ret = ext4_update_disksize_before_punch(inode, offset, len);
+ if (ret)
+ return ret;
- /*
- * For journalled data we need to write (and checkpoint) pages
- * before discarding page cache to avoid inconsitent data on
- * disk in case of crash before zeroing trans is committed.
- */
- if (ext4_should_journal_data(inode)) {
- ret = filemap_write_and_wait_range(mapping, start,
- end - 1);
- if (ret) {
- filemap_invalidate_unlock(mapping);
- goto out_mutex;
- }
- }
+ /* Now release the pages and zero block aligned part of pages */
+ ret = ext4_truncate_page_cache_block_range(inode, offset, end);
+ if (ret)
+ return ret;
- /* Now release the pages and zero block aligned part of pages */
- truncate_pagecache_range(inode, start, end - 1);
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ /* Zero range excluding the unaligned edges */
+ start_lblk = EXT4_B_TO_LBLK(inode, offset);
+ end_lblk = end >> blkbits;
+ if (end_lblk > start_lblk) {
+ ext4_lblk_t zero_blks = end_lblk - start_lblk;
- ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
- flags);
- filemap_invalidate_unlock(mapping);
+ if (mode & FALLOC_FL_WRITE_ZEROES)
+ flags = EXT4_GET_BLOCKS_CREATE_ZERO | EXT4_EX_NOCACHE;
+ else
+ flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
+ EXT4_EX_NOCACHE);
+ ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks,
+ new_size, flags);
if (ret)
- goto out_mutex;
+ return ret;
}
- if (!partial_begin && !partial_end)
- goto out_mutex;
+ /* Finish zeroing out if it doesn't contain partial block */
+ if (IS_ALIGNED(offset | end, blocksize))
+ return ret;
/*
* In worst case we have to writeout two nonadjacent unwritten
@@ -4705,27 +4687,69 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_std_error(inode->i_sb, ret);
- goto out_mutex;
+ return ret;
}
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ /* Zero out partial block at the edges of the range */
+ ret = ext4_zero_partial_blocks(handle, inode, offset, len);
+ if (ret)
+ goto out_handle;
+
if (new_size)
ext4_update_inode_size(inode, new_size);
ret = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret))
goto out_handle;
- /* Zero out partial block at the edges of the range */
- ret = ext4_zero_partial_blocks(handle, inode, offset, len);
- if (ret >= 0)
- ext4_update_inode_fsync_trans(handle, inode, 1);
+ ext4_update_inode_fsync_trans(handle, inode, 1);
if (file->f_flags & O_SYNC)
ext4_handle_sync(handle);
out_handle:
ext4_journal_stop(handle);
-out_mutex:
- inode_unlock(inode);
+ return ret;
+}
+
+static long ext4_do_fallocate(struct file *file, loff_t offset,
+ loff_t len, int mode)
+{
+ struct inode *inode = file_inode(file);
+ loff_t end = offset + len;
+ loff_t new_size = 0;
+ ext4_lblk_t start_lblk, len_lblk;
+ int ret;
+
+ trace_ext4_fallocate_enter(inode, offset, len, mode);
+ WARN_ON_ONCE(!inode_is_locked(inode));
+
+ start_lblk = offset >> inode->i_blkbits;
+ len_lblk = EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits);
+
+ /* We only support preallocation for extent-based files. */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) {
+ new_size = end;
+ ret = inode_newsize_ok(inode, new_size);
+ if (ret)
+ goto out;
+ }
+
+ ret = ext4_alloc_file_blocks(file, start_lblk, len_lblk, new_size,
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
+ if (ret)
+ goto out;
+
+ if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
+ ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
+ EXT4_I(inode)->i_sync_tid);
+ }
+out:
+ trace_ext4_fallocate_exit(inode, offset, len_lblk, ret);
return ret;
}
@@ -4739,12 +4763,8 @@ out_mutex:
long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
- loff_t new_size = 0;
- unsigned int max_blocks;
- int ret = 0;
- int flags;
- ext4_lblk_t lblk;
- unsigned int blkbits = inode->i_blkbits;
+ struct address_space *mapping = file->f_mapping;
+ int ret;
/*
* Encrypted inodes can't handle collapse range or insert
@@ -4755,83 +4775,158 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (IS_ENCRYPTED(inode) &&
(mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
return -EOPNOTSUPP;
+ /*
+ * Don't allow writing zeroes if the underlying device does not
+ * enable the unmap write zeroes operation.
+ */
+ if ((mode & FALLOC_FL_WRITE_ZEROES) &&
+ !bdev_write_zeroes_unmap_sectors(inode->i_sb->s_bdev))
+ return -EOPNOTSUPP;
/* Return error if mode is not supported */
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
- FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
- FALLOC_FL_INSERT_RANGE))
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_COLLAPSE_RANGE |
+ FALLOC_FL_INSERT_RANGE | FALLOC_FL_WRITE_ZEROES))
return -EOPNOTSUPP;
inode_lock(inode);
ret = ext4_convert_inline_data(inode);
- inode_unlock(inode);
if (ret)
- goto exit;
+ goto out_inode_lock;
- if (mode & FALLOC_FL_PUNCH_HOLE) {
- ret = ext4_punch_hole(file, offset, len);
- goto exit;
- }
+ /* Wait all existing dio workers, newcomers will block on i_rwsem */
+ inode_dio_wait(inode);
- if (mode & FALLOC_FL_COLLAPSE_RANGE) {
- ret = ext4_collapse_range(file, offset, len);
- goto exit;
- }
+ ret = file_modified(file);
+ if (ret)
+ goto out_inode_lock;
- if (mode & FALLOC_FL_INSERT_RANGE) {
- ret = ext4_insert_range(file, offset, len);
- goto exit;
+ if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ALLOCATE_RANGE) {
+ ret = ext4_do_fallocate(file, offset, len, mode);
+ goto out_inode_lock;
}
- if (mode & FALLOC_FL_ZERO_RANGE) {
+ /*
+ * Follow-up operations will drop page cache, hold invalidate lock
+ * to prevent page faults from reinstantiating pages we have
+ * released from page cache.
+ */
+ filemap_invalidate_lock(mapping);
+
+ ret = ext4_break_layouts(inode);
+ if (ret)
+ goto out_invalidate_lock;
+
+ switch (mode & FALLOC_FL_MODE_MASK) {
+ case FALLOC_FL_PUNCH_HOLE:
+ ret = ext4_punch_hole(file, offset, len);
+ break;
+ case FALLOC_FL_COLLAPSE_RANGE:
+ ret = ext4_collapse_range(file, offset, len);
+ break;
+ case FALLOC_FL_INSERT_RANGE:
+ ret = ext4_insert_range(file, offset, len);
+ break;
+ case FALLOC_FL_ZERO_RANGE:
+ case FALLOC_FL_WRITE_ZEROES:
ret = ext4_zero_range(file, offset, len, mode);
- goto exit;
+ break;
+ default:
+ ret = -EOPNOTSUPP;
}
- trace_ext4_fallocate_enter(inode, offset, len, mode);
- lblk = offset >> blkbits;
+out_invalidate_lock:
+ filemap_invalidate_unlock(mapping);
+out_inode_lock:
+ inode_unlock(inode);
+ return ret;
+}
+
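
For reference, the userspace view of the modes this rewritten dispatcher handles. FALLOC_FL_WRITE_ZEROES is new with this series and may be missing from older uapi headers, so the sketch guards it; the flag combinations are illustrative:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return 1;

	/* zero a block-aligned range without changing i_size */
	if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
		      4096, 8192) != 0)
		perror("FALLOC_FL_ZERO_RANGE");

#ifdef FALLOC_FL_WRITE_ZEROES
	/* ask for real zeroes on the media; ext4 rejects this with
	 * EOPNOTSUPP unless the device supports unmap write zeroes */
	if (fallocate(fd, FALLOC_FL_WRITE_ZEROES, 0, 4096) != 0)
		perror("FALLOC_FL_WRITE_ZEROES");
#endif
	close(fd);
	return 0;
}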
+/*
+ * This function converts a range of blocks to written extents. The caller of
+ * this function will pass the start offset and the size. all unwritten extents
+ * within this range will be converted to written extents.
+ *
+ * This function is called from the direct IO end_io callback function for
+ * atomic writes, to convert the unwritten extents after IO is completed.
+ *
+ * Note that the requirement for atomic writes is that all conversion should
+ * happen atomically in a single fs journal transaction. We only allocate
+ * unwritten extents either over a hole or over a pre-existing unwritten extent range in
+ * ext4_map_blocks_atomic_write(). The only case where we can have multiple
+ * unwritten extents in a range [offset, offset+len) is when there is a split
+ * unwritten extent between two leaf nodes which was cached in extent status
+ * cache during ext4_iomap_alloc() time. That will allow
+ * ext4_map_blocks_atomic_write() to return the unwritten extent range w/o going
+ * into the slow path. That means we might need a loop for converting this
+ * unwritten extent split across leaf blocks within a single journal transaction.
+ * Extents split across leaf nodes are a rare case, but let's still handle them
+ * to meet the requirements of multi-fsblock atomic writes.
+ *
+ * Returns 0 on success.
+ */
+int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode,
+ loff_t offset, ssize_t len)
+{
+ unsigned int max_blocks;
+ int ret = 0, ret2 = 0, ret3 = 0;
+ struct ext4_map_blocks map;
+ unsigned int blkbits = inode->i_blkbits;
+ unsigned int credits = 0;
+ int flags = EXT4_GET_BLOCKS_IO_CONVERT_EXT | EXT4_EX_NOCACHE;
+
+ map.m_lblk = offset >> blkbits;
max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
- flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
- inode_lock(inode);
+ if (!handle) {
+ /*
+ * TODO: An optimization can be added later by having an extent
+ * status flag e.g. EXTENT_STATUS_SPLIT_LEAF. If we query that
+ * it can tell if the extent in the cache is a split extent.
+ * But for now let's assume pextents as 2 always.
+ */
+ credits = ext4_meta_trans_blocks(inode, max_blocks, 2);
+ }
- /*
- * We only support preallocation for extent-based files only
- */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
- ret = -EOPNOTSUPP;
- goto out;
+ if (credits) {
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ return ret;
+ }
}
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- (offset + len > inode->i_size ||
- offset + len > EXT4_I(inode)->i_disksize)) {
- new_size = offset + len;
- ret = inode_newsize_ok(inode, new_size);
- if (ret)
- goto out;
+ while (ret >= 0 && ret < max_blocks) {
+ map.m_lblk += ret;
+ map.m_len = (max_blocks -= ret);
+ ret = ext4_map_blocks(handle, inode, &map, flags);
+ if (ret != max_blocks)
+ ext4_msg(inode->i_sb, KERN_INFO,
+ "inode #%lu: block %u: len %u: "
+ "split block mapping found for atomic write, "
+ "ret = %d",
+ inode->i_ino, map.m_lblk,
+ map.m_len, ret);
+ if (ret <= 0)
+ break;
}
- /* Wait all existing dio workers, newcomers will block on i_rwsem */
- inode_dio_wait(inode);
+ ret2 = ext4_mark_inode_dirty(handle, inode);
- ret = file_modified(file);
- if (ret)
- goto out;
+ if (credits) {
+ ret3 = ext4_journal_stop(handle);
+ if (unlikely(ret3))
+ ret2 = ret3;
+ }
- ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
- if (ret)
- goto out;
+ if (ret <= 0 || ret2)
+ ext4_warning(inode->i_sb,
+ "inode #%lu: block %u: len %u: "
+ "returned %d or %d",
+ inode->i_ino, map.m_lblk,
+ map.m_len, ret, ret2);
- if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
- ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
- EXT4_I(inode)->i_sync_tid);
- }
-out:
- inode_unlock(inode);
- trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
-exit:
- return ret;
+ return ret > 0 ? ret2 : ret;
}
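
For context, a hedged userspace sketch of the kind of write that reaches this
conversion path on I/O completion: a single O_DIRECT write issued with
RWF_ATOMIC. That flag is only available on recent kernels and the alignment
and length limits advertised via statx() are assumed to be respected:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <sys/uio.h>

	ssize_t atomic_write(int fd, const void *buf, size_t len, off_t off)
	{
		struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };

		/*
		 * fd is expected to be opened with O_DIRECT. RWF_ATOMIC is a
		 * recent uapi flag; older libcs may need <linux/fs.h> for it.
		 */
		return pwritev2(fd, &iov, 1, off, RWF_ATOMIC);
	}
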
/*
@@ -4873,8 +4968,14 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
break;
}
}
+ /*
+	 * Do not cache any unrelated extents, as this path does not hold
+	 * the i_rwsem or invalidate_lock; caching them could corrupt the
+	 * extent status tree.
+ */
ret = ext4_map_blocks(handle, inode, &map,
- EXT4_GET_BLOCKS_IO_CONVERT_EXT);
+ EXT4_GET_BLOCKS_IO_CONVERT_EXT |
+ EXT4_EX_NOCACHE);
if (ret <= 0)
ext4_warning(inode->i_sb,
"inode #%lu: block %u: len %u: "
@@ -4985,12 +5086,7 @@ static const struct iomap_ops ext4_iomap_xattr_ops = {
static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len)
{
- u64 maxbytes;
-
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- maxbytes = inode->i_sb->s_maxbytes;
- else
- maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+ u64 maxbytes = ext4_get_maxbytes(inode);
if (*len == 0)
return -EINVAL;
@@ -5010,10 +5106,11 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
{
int error = 0;
+ inode_lock_shared(inode);
if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
error = ext4_ext_precache(inode);
if (error)
- return error;
+ goto unlock;
fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
}
@@ -5024,15 +5121,19 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
*/
error = ext4_fiemap_check_ranges(inode, start, &len);
if (error)
- return error;
+ goto unlock;
if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR;
- return iomap_fiemap(inode, fieinfo, start, len,
- &ext4_iomap_xattr_ops);
+ error = iomap_fiemap(inode, fieinfo, start, len,
+ &ext4_iomap_xattr_ops);
+ } else {
+ error = iomap_fiemap(inode, fieinfo, start, len,
+ &ext4_iomap_report_ops);
}
-
- return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops);
+unlock:
+ inode_unlock_shared(inode);
+ return error;
}
int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -5053,7 +5154,9 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
+ inode_lock_shared(inode);
error = ext4_ext_precache(inode);
+ inode_unlock_shared(inode);
if (error)
return error;
fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
@@ -5112,7 +5215,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
credits = depth + 2;
}
- restart_credits = ext4_writepage_trans_blocks(inode);
+ restart_credits = ext4_chunk_trans_extent(inode, 0);
err = ext4_datasem_ensure_credits(handle, inode, credits,
restart_credits, 0);
if (err) {
@@ -5332,109 +5435,74 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct address_space *mapping = inode->i_mapping;
- ext4_lblk_t punch_start, punch_stop;
+ loff_t end = offset + len;
+ ext4_lblk_t start_lblk, end_lblk;
handle_t *handle;
unsigned int credits;
- loff_t new_size, ioffset;
+ loff_t start, new_size;
int ret;
- /*
- * We need to test this early because xfstests assumes that a
- * collapse range of (0, 1) will return EOPNOTSUPP if the file
- * system does not support collapse range.
- */
+ trace_ext4_collapse_range(inode, offset, len);
+ WARN_ON_ONCE(!inode_is_locked(inode));
+
+ /* Currently just for extent based files */
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return -EOPNOTSUPP;
-
/* Collapse range works only on fs cluster size aligned regions. */
if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
return -EINVAL;
-
- trace_ext4_collapse_range(inode, offset, len);
-
- punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
- punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
-
- inode_lock(inode);
/*
* There is no need to overlap collapse range with EOF, in which case
* it is effectively a truncate operation
*/
- if (offset + len >= inode->i_size) {
- ret = -EINVAL;
- goto out_mutex;
- }
-
- /* Currently just for extent based files */
- if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- ret = -EOPNOTSUPP;
- goto out_mutex;
- }
-
- /* Wait for existing dio to complete */
- inode_dio_wait(inode);
-
- ret = file_modified(file);
- if (ret)
- goto out_mutex;
-
- /*
- * Prevent page faults from reinstantiating pages we have released from
- * page cache.
- */
- filemap_invalidate_lock(mapping);
-
- ret = ext4_break_layouts(inode);
- if (ret)
- goto out_mmap;
+ if (end >= inode->i_size)
+ return -EINVAL;
/*
+	 * Write tail of the last page before the removed range, and the data
+	 * that will be shifted, since they will get removed from the page cache
+ * below. We are also protected from pages becoming dirty by
+ * i_rwsem and invalidate_lock.
* Need to round down offset to be aligned with page size boundary
* for page size > block size.
*/
- ioffset = round_down(offset, PAGE_SIZE);
- /*
- * Write tail of the last page before removed range since it will get
- * removed from the page cache below.
- */
- ret = filemap_write_and_wait_range(mapping, ioffset, offset);
- if (ret)
- goto out_mmap;
- /*
- * Write data that will be shifted to preserve them when discarding
- * page cache below. We are also protected from pages becoming dirty
- * by i_rwsem and invalidate_lock.
- */
- ret = filemap_write_and_wait_range(mapping, offset + len,
- LLONG_MAX);
+ start = round_down(offset, PAGE_SIZE);
+ ret = filemap_write_and_wait_range(mapping, start, offset);
+ if (!ret)
+ ret = filemap_write_and_wait_range(mapping, end, LLONG_MAX);
if (ret)
- goto out_mmap;
- truncate_pagecache(inode, ioffset);
+ return ret;
+
+ truncate_pagecache(inode, start);
- credits = ext4_writepage_trans_blocks(inode);
+ credits = ext4_chunk_trans_extent(inode, 0);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_mmap;
- }
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
+ start_lblk = offset >> inode->i_blkbits;
+ end_lblk = (offset + len) >> inode->i_blkbits;
+
+ ext4_check_map_extents_env(inode);
+
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
- ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start);
+ ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk);
- ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
+ ret = ext4_ext_remove_space(inode, start_lblk, end_lblk - 1);
if (ret) {
up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
+ goto out_handle;
}
ext4_discard_preallocations(inode);
- ret = ext4_ext_shift_extents(inode, handle, punch_stop,
- punch_stop - punch_start, SHIFT_LEFT);
+ ret = ext4_ext_shift_extents(inode, handle, end_lblk,
+ end_lblk - start_lblk, SHIFT_LEFT);
if (ret) {
up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
+ goto out_handle;
}
new_size = inode->i_size - len;
@@ -5442,18 +5510,16 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
EXT4_I(inode)->i_disksize = new_size;
up_write(&EXT4_I(inode)->i_data_sem);
- if (IS_SYNC(inode))
- ext4_handle_sync(handle);
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret = ext4_mark_inode_dirty(handle, inode);
+ if (ret)
+ goto out_handle;
+
ext4_update_inode_fsync_trans(handle, inode, 1);
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
-out_stop:
+out_handle:
ext4_journal_stop(handle);
-out_mmap:
- filemap_invalidate_unlock(mapping);
-out_mutex:
- inode_unlock(inode);
return ret;
}
@@ -5473,100 +5539,65 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
handle_t *handle;
struct ext4_ext_path *path;
struct ext4_extent *extent;
- ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
+ ext4_lblk_t start_lblk, len_lblk, ee_start_lblk = 0;
unsigned int credits, ee_len;
- int ret = 0, depth, split_flag = 0;
- loff_t ioffset;
+ int ret, depth, split_flag = 0;
+ loff_t start;
- /*
- * We need to test this early because xfstests assumes that an
- * insert range of (0, 1) will return EOPNOTSUPP if the file
- * system does not support insert range.
- */
+ trace_ext4_insert_range(inode, offset, len);
+ WARN_ON_ONCE(!inode_is_locked(inode));
+
+ /* Currently just for extent based files */
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return -EOPNOTSUPP;
-
/* Insert range works only on fs cluster size aligned regions. */
if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
return -EINVAL;
-
- trace_ext4_insert_range(inode, offset, len);
-
- offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
- len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);
-
- inode_lock(inode);
- /* Currently just for extent based files */
- if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- ret = -EOPNOTSUPP;
- goto out_mutex;
- }
-
- /* Check whether the maximum file size would be exceeded */
- if (len > inode->i_sb->s_maxbytes - inode->i_size) {
- ret = -EFBIG;
- goto out_mutex;
- }
-
/* Offset must be less than i_size */
- if (offset >= inode->i_size) {
- ret = -EINVAL;
- goto out_mutex;
- }
-
- /* Wait for existing dio to complete */
- inode_dio_wait(inode);
-
- ret = file_modified(file);
- if (ret)
- goto out_mutex;
+ if (offset >= inode->i_size)
+ return -EINVAL;
+ /* Check whether the maximum file size would be exceeded */
+ if (len > inode->i_sb->s_maxbytes - inode->i_size)
+ return -EFBIG;
/*
- * Prevent page faults from reinstantiating pages we have released from
- * page cache.
+ * Write out all dirty pages. Need to round down to align start offset
+ * to page size boundary for page size > block size.
*/
- filemap_invalidate_lock(mapping);
-
- ret = ext4_break_layouts(inode);
+ start = round_down(offset, PAGE_SIZE);
+ ret = filemap_write_and_wait_range(mapping, start, LLONG_MAX);
if (ret)
- goto out_mmap;
+ return ret;
- /*
- * Need to round down to align start offset to page size boundary
- * for page size > block size.
- */
- ioffset = round_down(offset, PAGE_SIZE);
- /* Write out all dirty pages */
- ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
- LLONG_MAX);
- if (ret)
- goto out_mmap;
- truncate_pagecache(inode, ioffset);
+ truncate_pagecache(inode, start);
- credits = ext4_writepage_trans_blocks(inode);
+ credits = ext4_chunk_trans_extent(inode, 0);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_mmap;
- }
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
/* Expand file to avoid data loss if there is error while shifting */
inode->i_size += len;
EXT4_I(inode)->i_disksize += len;
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret = ext4_mark_inode_dirty(handle, inode);
if (ret)
- goto out_stop;
+ goto out_handle;
+
+ start_lblk = offset >> inode->i_blkbits;
+ len_lblk = len >> inode->i_blkbits;
+
+ ext4_check_map_extents_env(inode);
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
- path = ext4_find_extent(inode, offset_lblk, NULL, 0);
+ path = ext4_find_extent(inode, start_lblk, NULL, 0);
if (IS_ERR(path)) {
up_write(&EXT4_I(inode)->i_data_sem);
ret = PTR_ERR(path);
- goto out_stop;
+ goto out_handle;
}
depth = ext_depth(inode);
@@ -5576,16 +5607,16 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
ee_len = ext4_ext_get_actual_len(extent);
/*
- * If offset_lblk is not the starting block of extent, split
- * the extent @offset_lblk
+ * If start_lblk is not the starting block of extent, split
+ * the extent @start_lblk
*/
- if ((offset_lblk > ee_start_lblk) &&
- (offset_lblk < (ee_start_lblk + ee_len))) {
+ if ((start_lblk > ee_start_lblk) &&
+ (start_lblk < (ee_start_lblk + ee_len))) {
if (ext4_ext_is_unwritten(extent))
split_flag = EXT4_EXT_MARK_UNWRIT1 |
EXT4_EXT_MARK_UNWRIT2;
path = ext4_split_extent_at(handle, inode, path,
- offset_lblk, split_flag,
+ start_lblk, split_flag,
EXT4_EX_NOCACHE |
EXT4_GET_BLOCKS_PRE_IO |
EXT4_GET_BLOCKS_METADATA_NOFAIL);
@@ -5594,32 +5625,29 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
if (IS_ERR(path)) {
up_write(&EXT4_I(inode)->i_data_sem);
ret = PTR_ERR(path);
- goto out_stop;
+ goto out_handle;
}
}
ext4_free_ext_path(path);
- ext4_es_remove_extent(inode, offset_lblk, EXT_MAX_BLOCKS - offset_lblk);
+ ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk);
/*
- * if offset_lblk lies in a hole which is at start of file, use
+ * if start_lblk lies in a hole which is at start of file, use
* ee_start_lblk to shift extents
*/
ret = ext4_ext_shift_extents(inode, handle,
- max(ee_start_lblk, offset_lblk), len_lblk, SHIFT_RIGHT);
-
+ max(ee_start_lblk, start_lblk), len_lblk, SHIFT_RIGHT);
up_write(&EXT4_I(inode)->i_data_sem);
+ if (ret)
+ goto out_handle;
+
+ ext4_update_inode_fsync_trans(handle, inode, 1);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- if (ret >= 0)
- ext4_update_inode_fsync_trans(handle, inode, 1);
-out_stop:
+out_handle:
ext4_journal_stop(handle);
-out_mmap:
- filemap_invalidate_unlock(mapping);
-out_mutex:
- inode_unlock(inode);
return ret;
}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index ae29832aab1e..31dc0496f8d0 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -120,9 +120,40 @@
* memory. Hence, we will reclaim written/unwritten/hole extents from
* the tree under a heavy memory pressure.
*
+ * ==========================================================================
+ * 3. Assurance of Ext4 extent status tree consistency
+ *
+ * When mapping blocks, Ext4 queries the extent status tree first and should
+ * always trust that the extent status tree is consistent and up to date.
+ * Therefore, it is important to adhere to the following rules when creating,
+ * modifying and removing extents.
+ *
+ * 1. Besides fastcommit replay, when Ext4 creates or queries block mappings,
+ * the extent information should always be processed through the extent
+ * status tree instead of being organized manually through the on-disk
+ * extent tree.
+ *
+ * 2. When updating the extent tree, Ext4 should acquire the i_data_sem
+ * exclusively and update the extent status tree atomically. If the extents
+ *    to be modified are large enough that they cannot all be processed
+ *    under a single hold of i_data_sem (as ext4_datasem_ensure_credits()
+ *    may drop i_data_sem to restart a transaction), it must (e.g. as
+ *    ext4_punch_hole() does):
+ *
+ * a) Hold the i_rwsem and invalidate_lock exclusively. This ensures
+ * exclusion against page faults, as well as reads and writes that may
+ * concurrently modify the extent status tree.
+ * b) Evict all page cache in the affected range and recommend rebuilding
+ * or dropping the extent status tree after modifying the on-disk
+ * extent tree. This ensures exclusion against concurrent writebacks
+ *       that do not hold those locks but only hold a folio lock.
+ *
+ * 3. Based on the rules above, when querying block mappings, Ext4 should at
+ * least hold the i_rwsem or invalidate_lock or folio lock(s) for the
+ * specified querying range.
*
* ==========================================================================
- * 3. Performance analysis
+ * 4. Performance analysis
*
* -- overhead
* 1. There is a cache extent for write access, so if writes are
@@ -134,7 +165,7 @@
*
*
* ==========================================================================
- * 4. TODO list
+ * 5. TODO list
*
* -- Refactor delayed space reservation
*
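
A minimal sketch of rule 3 above, mirroring how ext4_fiemap() now wraps its
queries; the map setup here is illustrative only:

	struct ext4_map_blocks map = { .m_lblk = lblk, .m_len = len };

	inode_lock_shared(inode);	/* hold i_rwsem shared across the query */
	ret = ext4_map_blocks(NULL, inode, &map, 0);
	inode_unlock_shared(inode);
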
@@ -1551,7 +1582,6 @@ retry:
ext4_es_print_tree(inode);
ext4_da_release_space(inode, reserved);
- return;
}
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index da4263a14a20..42bee1d4f9f9 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -12,6 +12,7 @@
#include "ext4_extents.h"
#include "mballoc.h"
+#include <linux/lockdep.h>
/*
* Ext4 Fast Commits
* -----------------
@@ -49,19 +50,27 @@
* that need to be committed during a fast commit in another in memory queue of
* inodes. During the commit operation, we commit in the following order:
*
- * [1] Lock inodes for any further data updates by setting COMMITTING state
- * [2] Submit data buffers of all the inodes
- * [3] Wait for [2] to complete
- * [4] Commit all the directory entry updates in the fast commit space
- * [5] Commit all the changed inode structures
- * [6] Write tail tag (this tag ensures the atomicity, please read the following
+ * [1] Prepare all the inodes to write out their data by setting
+ *     "EXT4_STATE_FC_FLUSHING_DATA". This ensures that an inode cannot be
+ * deleted while it is being flushed.
+ * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA"
+ * state.
+ * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that
+ *     all the existing handles finish and no new handles can start.
+ * [4] Mark all the fast commit eligible inodes as undergoing fast commit
+ * by setting "EXT4_STATE_FC_COMMITTING" state.
+ * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows
+ * starting of new handles. If new handles try to start an update on
+ * any of the inodes that are being committed, ext4_fc_track_inode()
+ * will block until those inodes have finished the fast commit.
+ * [6] Commit all the directory entry updates in the fast commit space.
+ * [7] Commit all the changed inodes in the fast commit space and clear
+ * "EXT4_STATE_FC_COMMITTING" for these inodes.
+ * [8] Write tail tag (this tag ensures the atomicity, please read the following
* section for more details).
- * [7] Wait for [4], [5] and [6] to complete.
*
- * All the inode updates must call ext4_fc_start_update() before starting an
- * update. If such an ongoing update is present, fast commit waits for it to
- * complete. The completion of such an update is marked by
- * ext4_fc_stop_update().
+ * All the inode updates must be enclosed within jbd2_journal_start()
+ * and jbd2_journal_stop() similar to JBD2 journaling.
*
* Fast Commit Ineligibility
* -------------------------
@@ -142,6 +151,13 @@
* similarly. Thus, by converting a non-idempotent procedure into a series of
* idempotent outcomes, fast commits ensured idempotence during the replay.
*
+ * Locking
+ * -------
+ * sbi->s_fc_lock protects the fast commit inodes queue and the fast commit
+ * dentry queue. ei->i_fc_lock protects the fast commit related info in a given
+ * inode. Most of the code avoids acquiring both the locks, but if one must do
+ * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock.
+ *
* TODOs
* -----
*
@@ -156,13 +172,12 @@
* fast commit recovery even if that area is invalidated by later full
* commits.
*
- * 1) Fast commit's commit path locks the entire file system during fast
- * commit. This has significant performance penalty. Instead of that, we
- * should use ext4_fc_start/stop_update functions to start inode level
- * updates from ext4_journal_start/stop. Once we do that we can drop file
- * system locking during commit path.
+ * 1) Handle more ineligible cases.
*
- * 2) Handle more ineligible cases.
+ * 2) Change ext4_fc_commit() to lookup logical to physical mapping using extent
+ * status tree. This would get rid of the need to call ext4_fc_track_inode()
+ * before acquiring i_data_sem. To do that we would need to ensure that
+ * modified extents from the extent status tree are not evicted from memory.
*/
#include <trace/events/ext4.h>
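
A minimal sketch of the lock ordering documented in the Locking section
above, for the rare case where both locks are needed; the update itself is
elided:

	/* Outer lock: protects the fast commit inode and dentry queues. */
	mutex_lock(&sbi->s_fc_lock);
	/* Inner lock: protects the per-inode fast commit info. */
	spin_lock(&ei->i_fc_lock);
	/* ... update the queues and the inode's fc info ... */
	spin_unlock(&ei->i_fc_lock);
	mutex_unlock(&sbi->s_fc_lock);
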
@@ -201,32 +216,6 @@ void ext4_fc_init_inode(struct inode *inode)
INIT_LIST_HEAD(&ei->i_fc_list);
INIT_LIST_HEAD(&ei->i_fc_dilist);
init_waitqueue_head(&ei->i_fc_wait);
- atomic_set(&ei->i_fc_updates, 0);
-}
-
-/* This function must be called with sbi->s_fc_lock held. */
-static void ext4_fc_wait_committing_inode(struct inode *inode)
-__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
-{
- wait_queue_head_t *wq;
- struct ext4_inode_info *ei = EXT4_I(inode);
-
-#if (BITS_PER_LONG < 64)
- DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
- EXT4_STATE_FC_COMMITTING);
- wq = bit_waitqueue(&ei->i_state_flags,
- EXT4_STATE_FC_COMMITTING);
-#else
- DEFINE_WAIT_BIT(wait, &ei->i_flags,
- EXT4_STATE_FC_COMMITTING);
- wq = bit_waitqueue(&ei->i_flags,
- EXT4_STATE_FC_COMMITTING);
-#endif
- lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
- prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
- spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
- schedule();
- finish_wait(wq, &wait.wq_entry);
}
static bool ext4_fc_disabled(struct super_block *sb)
@@ -236,48 +225,6 @@ static bool ext4_fc_disabled(struct super_block *sb)
}
/*
- * Inform Ext4's fast about start of an inode update
- *
- * This function is called by the high level call VFS callbacks before
- * performing any inode update. This function blocks if there's an ongoing
- * fast commit on the inode in question.
- */
-void ext4_fc_start_update(struct inode *inode)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
-
- if (ext4_fc_disabled(inode->i_sb))
- return;
-
-restart:
- spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
- if (list_empty(&ei->i_fc_list))
- goto out;
-
- if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
- ext4_fc_wait_committing_inode(inode);
- goto restart;
- }
-out:
- atomic_inc(&ei->i_fc_updates);
- spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
-}
-
-/*
- * Stop inode update and wake up waiting fast commits if any.
- */
-void ext4_fc_stop_update(struct inode *inode)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
-
- if (ext4_fc_disabled(inode->i_sb))
- return;
-
- if (atomic_dec_and_test(&ei->i_fc_updates))
- wake_up_all(&ei->i_fc_wait);
-}
-
-/*
* Remove inode from fast commit list. If the inode is being committed
* we wait until inode commit is done.
*/
@@ -286,31 +233,62 @@ void ext4_fc_del(struct inode *inode)
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_fc_dentry_update *fc_dentry;
+ wait_queue_head_t *wq;
if (ext4_fc_disabled(inode->i_sb))
return;
-restart:
- spin_lock(&sbi->s_fc_lock);
+ mutex_lock(&sbi->s_fc_lock);
if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
return;
}
- if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
- ext4_fc_wait_committing_inode(inode);
- goto restart;
+ /*
+ * Since ext4_fc_del is called from ext4_evict_inode while having a
+ * handle open, there is no need for us to wait here even if a fast
+ * commit is going on. That is because, if this inode is being
+ * committed, ext4_mark_inode_dirty would have waited for inode commit
+ * operation to finish before we come here. So, by the time we come
+ * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So,
+ * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode
+ * here.
+ *
+ * We may come here without any handles open in the "no_delete" case of
+ * ext4_evict_inode as well. However, if that happens, we first mark the
+ * file system as fast commit ineligible anyway. So, even in that case,
+ * it is okay to remove the inode from the fc list.
+ */
+ WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)
+ && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE));
+ while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
+#if (BITS_PER_LONG < 64)
+ DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
+ EXT4_STATE_FC_FLUSHING_DATA);
+ wq = bit_waitqueue(&ei->i_state_flags,
+ EXT4_STATE_FC_FLUSHING_DATA);
+#else
+ DEFINE_WAIT_BIT(wait, &ei->i_flags,
+ EXT4_STATE_FC_FLUSHING_DATA);
+ wq = bit_waitqueue(&ei->i_flags,
+ EXT4_STATE_FC_FLUSHING_DATA);
+#endif
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
+ mutex_unlock(&sbi->s_fc_lock);
+ schedule();
+ mutex_lock(&sbi->s_fc_lock);
+ }
+ finish_wait(wq, &wait.wq_entry);
}
-
- if (!list_empty(&ei->i_fc_list))
- list_del_init(&ei->i_fc_list);
+ list_del_init(&ei->i_fc_list);
/*
* Since this inode is getting removed, let's also remove all FC
	 * dentry create references, since there is no need to log them anyway.
*/
if (list_empty(&ei->i_fc_dilist)) {
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
return;
}
@@ -320,12 +298,10 @@ restart:
list_del_init(&fc_dentry->fcd_dilist);
WARN_ON(!list_empty(&ei->i_fc_dilist));
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
release_dentry_name_snapshot(&fc_dentry->fcd_name);
kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
-
- return;
}
/*
@@ -353,12 +329,12 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl
has_transaction = false;
read_unlock(&sbi->s_journal->j_state_lock);
}
- spin_lock(&sbi->s_fc_lock);
+ mutex_lock(&sbi->s_fc_lock);
is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid)))
sbi->s_fc_ineligible_tid = tid;
ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
WARN_ON(reason >= EXT4_FC_REASON_MAX);
sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}
@@ -385,7 +361,7 @@ static int ext4_fc_track_template(
int ret;
tid = handle->h_transaction->t_tid;
- mutex_lock(&ei->i_fc_lock);
+ spin_lock(&ei->i_fc_lock);
if (tid == ei->i_sync_tid) {
update = true;
} else {
@@ -393,19 +369,18 @@ static int ext4_fc_track_template(
ei->i_sync_tid = tid;
}
ret = __fc_track_fn(handle, inode, args, update);
- mutex_unlock(&ei->i_fc_lock);
-
+ spin_unlock(&ei->i_fc_lock);
if (!enqueue)
return ret;
- spin_lock(&sbi->s_fc_lock);
+ mutex_lock(&sbi->s_fc_lock);
if (list_empty(&EXT4_I(inode)->i_fc_list))
list_add_tail(&EXT4_I(inode)->i_fc_list,
(sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
&sbi->s_fc_q[FC_Q_STAGING] :
&sbi->s_fc_q[FC_Q_MAIN]);
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
return ret;
}
@@ -428,19 +403,19 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode,
struct super_block *sb = inode->i_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- mutex_unlock(&ei->i_fc_lock);
+ spin_unlock(&ei->i_fc_lock);
if (IS_ENCRYPTED(dir)) {
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME,
handle);
- mutex_lock(&ei->i_fc_lock);
+ spin_lock(&ei->i_fc_lock);
return -EOPNOTSUPP;
}
node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
if (!node) {
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle);
- mutex_lock(&ei->i_fc_lock);
+ spin_lock(&ei->i_fc_lock);
return -ENOMEM;
}
@@ -449,7 +424,8 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode,
node->fcd_ino = inode->i_ino;
take_dentry_name_snapshot(&node->fcd_name, dentry);
INIT_LIST_HEAD(&node->fcd_dilist);
- spin_lock(&sbi->s_fc_lock);
+ INIT_LIST_HEAD(&node->fcd_list);
+ mutex_lock(&sbi->s_fc_lock);
if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
list_add_tail(&node->fcd_list,
@@ -470,8 +446,8 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode,
WARN_ON(!list_empty(&ei->i_fc_dilist));
list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
}
- spin_unlock(&sbi->s_fc_lock);
- mutex_lock(&ei->i_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
+ spin_lock(&ei->i_fc_lock);
return 0;
}
@@ -571,6 +547,8 @@ static int __track_inode(handle_t *handle, struct inode *inode, void *arg,
void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ wait_queue_head_t *wq;
int ret;
if (S_ISDIR(inode->i_mode))
@@ -588,6 +566,35 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
return;
+ /*
+ * If we come here, we may sleep while waiting for the inode to
+ * commit. We shouldn't be holding i_data_sem when we go to sleep since
+ * the commit path needs to grab the lock while committing the inode.
+ */
+ lockdep_assert_not_held(&ei->i_data_sem);
+
+ while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
+#if (BITS_PER_LONG < 64)
+ DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
+ EXT4_STATE_FC_COMMITTING);
+ wq = bit_waitqueue(&ei->i_state_flags,
+ EXT4_STATE_FC_COMMITTING);
+#else
+ DEFINE_WAIT_BIT(wait, &ei->i_flags,
+ EXT4_STATE_FC_COMMITTING);
+ wq = bit_waitqueue(&ei->i_flags,
+ EXT4_STATE_FC_COMMITTING);
+#endif
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
+ schedule();
+ finish_wait(wq, &wait.wq_entry);
+ }
+
+ /*
+ * From this point on, this inode will not be committed either
+ * by fast or full commit as long as the handle is open.
+ */
ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
trace_ext4_fc_track_inode(handle, inode, ret);
}
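
A hedged sketch of the caller ordering this implies: ext4_fc_track_inode()
must run, and possibly sleep, before i_data_sem is taken (the credits value
and the extent update are placeholders):

	handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	/* May sleep on EXT4_STATE_FC_COMMITTING; i_data_sem not held yet. */
	ext4_fc_track_inode(handle, inode);
	down_write(&EXT4_I(inode)->i_data_sem);
	/* ... modify the extent tree and the extent status tree ... */
	up_write(&EXT4_I(inode)->i_data_sem);
	ext4_journal_stop(handle);
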
@@ -727,7 +734,7 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
tl.fc_len = cpu_to_le16(remaining);
memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining);
- *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize);
+ *crc = ext4_chksum(*crc, sbi->s_fc_bh->b_data, bsize);
ext4_fc_submit_bh(sb, false);
@@ -774,7 +781,7 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid));
dst += sizeof(tail.fc_tid);
- crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data,
+ crc = ext4_chksum(crc, sbi->s_fc_bh->b_data,
dst - (u8 *)sbi->s_fc_bh->b_data);
tail.fc_crc = cpu_to_le32(crc);
memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc));
@@ -893,15 +900,15 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
struct ext4_extent *ex;
int ret;
- mutex_lock(&ei->i_fc_lock);
+ spin_lock(&ei->i_fc_lock);
if (ei->i_fc_lblk_len == 0) {
- mutex_unlock(&ei->i_fc_lock);
+ spin_unlock(&ei->i_fc_lock);
return 0;
}
old_blk_size = ei->i_fc_lblk_start;
new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
ei->i_fc_lblk_len = 0;
- mutex_unlock(&ei->i_fc_lock);
+ spin_unlock(&ei->i_fc_lock);
cur_lblk_off = old_blk_size;
ext4_debug("will try writing %d to %d for inode %ld\n",
@@ -910,7 +917,9 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
while (cur_lblk_off <= new_blk_size) {
map.m_lblk = cur_lblk_off;
map.m_len = new_blk_size - cur_lblk_off + 1;
- ret = ext4_map_blocks(NULL, inode, &map, 0);
+ ret = ext4_map_blocks(NULL, inode, &map,
+ EXT4_GET_BLOCKS_IO_SUBMIT |
+ EXT4_EX_NOCACHE);
if (ret < 0)
return -ECANCELED;
@@ -954,69 +963,31 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
}
-/* Submit data for all the fast commit inodes */
-static int ext4_fc_submit_inode_data_all(journal_t *journal)
+/* Flushes data of all the inodes in the commit queue. */
+static int ext4_fc_flush_data(journal_t *journal)
{
struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_inode_info *ei;
int ret = 0;
- spin_lock(&sbi->s_fc_lock);
list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
- ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
- while (atomic_read(&ei->i_fc_updates)) {
- DEFINE_WAIT(wait);
-
- prepare_to_wait(&ei->i_fc_wait, &wait,
- TASK_UNINTERRUPTIBLE);
- if (atomic_read(&ei->i_fc_updates)) {
- spin_unlock(&sbi->s_fc_lock);
- schedule();
- spin_lock(&sbi->s_fc_lock);
- }
- finish_wait(&ei->i_fc_wait, &wait);
- }
- spin_unlock(&sbi->s_fc_lock);
ret = jbd2_submit_inode_data(journal, ei->jinode);
if (ret)
return ret;
- spin_lock(&sbi->s_fc_lock);
}
- spin_unlock(&sbi->s_fc_lock);
-
- return ret;
-}
-
-/* Wait for completion of data for all the fast commit inodes */
-static int ext4_fc_wait_inode_data_all(journal_t *journal)
-{
- struct super_block *sb = journal->j_private;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_inode_info *pos, *n;
- int ret = 0;
-
- spin_lock(&sbi->s_fc_lock);
- list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
- if (!ext4_test_inode_state(&pos->vfs_inode,
- EXT4_STATE_FC_COMMITTING))
- continue;
- spin_unlock(&sbi->s_fc_lock);
- ret = jbd2_wait_inode_data(journal, pos->jinode);
+ list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ ret = jbd2_wait_inode_data(journal, ei->jinode);
if (ret)
return ret;
- spin_lock(&sbi->s_fc_lock);
}
- spin_unlock(&sbi->s_fc_lock);
return 0;
}
/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
-__acquires(&sbi->s_fc_lock)
-__releases(&sbi->s_fc_lock)
{
struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -1030,26 +1001,22 @@ __releases(&sbi->s_fc_lock)
list_for_each_entry_safe(fc_dentry, fc_dentry_n,
&sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
- spin_unlock(&sbi->s_fc_lock);
- if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
- ret = -ENOSPC;
- goto lock_and_exit;
- }
- spin_lock(&sbi->s_fc_lock);
+ if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry))
+ return -ENOSPC;
continue;
}
/*
* With fcd_dilist we need not loop in sbi->s_fc_q to get the
- * corresponding inode pointer
+ * corresponding inode. Also, the corresponding inode could have been
+	 * deleted, in which case we don't need to do anything.
*/
- WARN_ON(list_empty(&fc_dentry->fcd_dilist));
+ if (list_empty(&fc_dentry->fcd_dilist))
+ continue;
ei = list_first_entry(&fc_dentry->fcd_dilist,
struct ext4_inode_info, i_fc_dilist);
inode = &ei->vfs_inode;
WARN_ON(inode->i_ino != fc_dentry->fcd_ino);
- spin_unlock(&sbi->s_fc_lock);
-
/*
* We first write the inode and then the create dirent. This
* allows the recovery code to create an unnamed inode first
@@ -1059,23 +1026,14 @@ __releases(&sbi->s_fc_lock)
*/
ret = ext4_fc_write_inode(inode, crc);
if (ret)
- goto lock_and_exit;
-
+ return ret;
ret = ext4_fc_write_inode_data(inode, crc);
if (ret)
- goto lock_and_exit;
-
- if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
- ret = -ENOSPC;
- goto lock_and_exit;
- }
-
- spin_lock(&sbi->s_fc_lock);
+ return ret;
+ if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry))
+ return -ENOSPC;
}
return 0;
-lock_and_exit:
- spin_lock(&sbi->s_fc_lock);
- return ret;
}
static int ext4_fc_perform_commit(journal_t *journal)
@@ -1089,26 +1047,81 @@ static int ext4_fc_perform_commit(journal_t *journal)
int ret = 0;
u32 crc = 0;
- ret = ext4_fc_submit_inode_data_all(journal);
- if (ret)
- return ret;
+ /*
+ * Step 1: Mark all inodes on s_fc_q[MAIN] with
+ * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being
+ * freed until the data flush is over.
+ */
+ mutex_lock(&sbi->s_fc_lock);
+ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ ext4_set_inode_state(&iter->vfs_inode,
+ EXT4_STATE_FC_FLUSHING_DATA);
+ }
+ mutex_unlock(&sbi->s_fc_lock);
+
+ /* Step 2: Flush data for all the eligible inodes. */
+ ret = ext4_fc_flush_data(journal);
- ret = ext4_fc_wait_inode_data_all(journal);
+ /*
+ * Step 3: Clear EXT4_STATE_FC_FLUSHING_DATA flag, before returning
+ * any error from step 2. This ensures that waiters waiting on
+ * EXT4_STATE_FC_FLUSHING_DATA can resume.
+ */
+ mutex_lock(&sbi->s_fc_lock);
+ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ ext4_clear_inode_state(&iter->vfs_inode,
+ EXT4_STATE_FC_FLUSHING_DATA);
+#if (BITS_PER_LONG < 64)
+ wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA);
+#else
+ wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA);
+#endif
+ }
+
+ /*
+ * Make sure clearing of EXT4_STATE_FC_FLUSHING_DATA is visible before
+ * the waiter checks the bit. Pairs with implicit barrier in
+ * prepare_to_wait() in ext4_fc_del().
+ */
+ smp_mb();
+ mutex_unlock(&sbi->s_fc_lock);
+
+ /*
+	 * If we encountered an error in Step 2, return it now after clearing
+	 * the EXT4_STATE_FC_FLUSHING_DATA bit.
+ */
if (ret)
return ret;
+
+ /* Step 4: Mark all inodes as being committed. */
+ jbd2_journal_lock_updates(journal);
/*
- * If file system device is different from journal device, issue a cache
- * flush before we start writing fast commit blocks.
+ * The journal is now locked. No more handles can start and all the
+ * previous handles are now drained. We now mark the inodes on the
+ * commit queue as being committed.
+ */
+ mutex_lock(&sbi->s_fc_lock);
+ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ ext4_set_inode_state(&iter->vfs_inode,
+ EXT4_STATE_FC_COMMITTING);
+ }
+ mutex_unlock(&sbi->s_fc_lock);
+ jbd2_journal_unlock_updates(journal);
+
+ /*
+ * Step 5: If file system device is different from journal device,
+ * issue a cache flush before we start writing fast commit blocks.
*/
if (journal->j_fs_dev != journal->j_dev)
blkdev_issue_flush(journal->j_fs_dev);
blk_start_plug(&plug);
+ /* Step 6: Write fast commit blocks to disk. */
if (sbi->s_fc_bytes == 0) {
/*
- * Add a head tag only if this is the first fast commit
- * in this TID.
+ * Step 6.1: Add a head tag only if this is the first fast
+ * commit in this TID.
*/
head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
head.fc_tid = cpu_to_le32(
@@ -1120,32 +1133,30 @@ static int ext4_fc_perform_commit(journal_t *journal)
}
}
- spin_lock(&sbi->s_fc_lock);
+ /* Step 6.2: Now write all the dentry updates. */
+ mutex_lock(&sbi->s_fc_lock);
ret = ext4_fc_commit_dentry_updates(journal, &crc);
- if (ret) {
- spin_unlock(&sbi->s_fc_lock);
+ if (ret)
goto out;
- }
+ /* Step 6.3: Now write all the changed inodes to disk. */
list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
inode = &iter->vfs_inode;
if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
continue;
- spin_unlock(&sbi->s_fc_lock);
ret = ext4_fc_write_inode_data(inode, &crc);
if (ret)
goto out;
ret = ext4_fc_write_inode(inode, &crc);
if (ret)
goto out;
- spin_lock(&sbi->s_fc_lock);
}
- spin_unlock(&sbi->s_fc_lock);
-
+ /* Step 6.4: Finally write tail tag to conclude this fast commit. */
ret = ext4_fc_write_tail(sb, crc);
out:
+ mutex_unlock(&sbi->s_fc_lock);
blk_finish_plug(&plug);
return ret;
}
@@ -1191,6 +1202,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
int subtid = atomic_read(&sbi->s_fc_subtid);
int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
ktime_t start_time, commit_time;
+ int old_ioprio, journal_ioprio;
if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
return jbd2_complete_transaction(journal, commit_tid);
@@ -1198,6 +1210,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
trace_ext4_fc_commit_start(sb, commit_tid);
start_time = ktime_get();
+ old_ioprio = get_current_ioprio();
restart_fc:
ret = jbd2_fc_begin_commit(journal, commit_tid);
@@ -1228,6 +1241,15 @@ restart_fc:
goto fallback;
}
+ /*
+ * Now that we know that this thread is going to do a fast commit,
+ * elevate the priority to match that of the journal thread.
+ */
+ if (journal->j_task->io_context)
+ journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
+ else
+ journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
+ set_task_ioprio(current, journal_ioprio);
fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
ret = ext4_fc_perform_commit(journal);
if (ret < 0) {
@@ -1242,6 +1264,7 @@ restart_fc:
}
atomic_inc(&sbi->s_fc_subtid);
ret = jbd2_fc_end_commit(journal);
+ set_task_ioprio(current, old_ioprio);
/*
* weight the commit time higher than the average time so we
* don't react too strongly to vast changes in the commit time
@@ -1251,6 +1274,7 @@ restart_fc:
return ret;
fallback:
+ set_task_ioprio(current, old_ioprio);
ret = jbd2_fc_end_commit_fallback(journal);
ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
return ret;
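
A condensed sketch of the save/elevate/restore ioprio pattern added above
(error handling elided):

	int old_ioprio = get_current_ioprio();

	/* Do the fast commit I/O at the journal thread's priority. */
	set_task_ioprio(current, journal_ioprio);
	ret = ext4_fc_perform_commit(journal);
	/* Restore the caller's original priority afterwards. */
	set_task_ioprio(current, old_ioprio);
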
@@ -1264,7 +1288,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
{
struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_inode_info *iter, *iter_n;
+ struct ext4_inode_info *ei;
struct ext4_fc_dentry_update *fc_dentry;
if (full && sbi->s_fc_bh)
@@ -1273,14 +1297,16 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
trace_ext4_fc_cleanup(journal, full, tid);
jbd2_fc_release_bufs(journal);
- spin_lock(&sbi->s_fc_lock);
- list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
- i_fc_list) {
- list_del_init(&iter->i_fc_list);
- ext4_clear_inode_state(&iter->vfs_inode,
+ mutex_lock(&sbi->s_fc_lock);
+ while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) {
+ ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN],
+ struct ext4_inode_info,
+ i_fc_list);
+ list_del_init(&ei->i_fc_list);
+ ext4_clear_inode_state(&ei->vfs_inode,
EXT4_STATE_FC_COMMITTING);
- if (tid_geq(tid, iter->i_sync_tid)) {
- ext4_fc_reset_inode(&iter->vfs_inode);
+ if (tid_geq(tid, ei->i_sync_tid)) {
+ ext4_fc_reset_inode(&ei->vfs_inode);
} else if (full) {
/*
* We are called after a full commit, inode has been
@@ -1291,15 +1317,19 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
* time in that case (and tid doesn't increase so
* tid check above isn't reliable).
*/
- list_add_tail(&EXT4_I(&iter->vfs_inode)->i_fc_list,
+ list_add_tail(&ei->i_fc_list,
&sbi->s_fc_q[FC_Q_STAGING]);
}
- /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
+ /*
+ * Make sure clearing of EXT4_STATE_FC_COMMITTING is
+ * visible before we send the wakeup. Pairs with implicit
+ * barrier in prepare_to_wait() in ext4_fc_track_inode().
+ */
smp_mb();
#if (BITS_PER_LONG < 64)
- wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
+ wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
- wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
+ wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
}
@@ -1309,11 +1339,9 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
fcd_list);
list_del_init(&fc_dentry->fcd_list);
list_del_init(&fc_dentry->fcd_dilist);
- spin_unlock(&sbi->s_fc_lock);
release_dentry_name_snapshot(&fc_dentry->fcd_name);
kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
- spin_lock(&sbi->s_fc_lock);
}
list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
@@ -1328,7 +1356,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
if (full)
sbi->s_fc_bytes = 0;
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
trace_ext4_fc_stats(sb);
}
@@ -2105,13 +2133,13 @@ static int ext4_fc_replay_scan(journal_t *journal,
case EXT4_FC_TAG_INODE:
case EXT4_FC_TAG_PAD:
state->fc_cur_tag++;
- state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
+ state->fc_crc = ext4_chksum(state->fc_crc, cur,
EXT4_FC_TAG_BASE_LEN + tl.fc_len);
break;
case EXT4_FC_TAG_TAIL:
state->fc_cur_tag++;
memcpy(&tail, val, sizeof(tail));
- state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
+ state->fc_crc = ext4_chksum(state->fc_crc, cur,
EXT4_FC_TAG_BASE_LEN +
offsetof(struct ext4_fc_tail,
fc_crc));
@@ -2138,7 +2166,7 @@ static int ext4_fc_replay_scan(journal_t *journal,
break;
}
state->fc_cur_tag++;
- state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
+ state->fc_crc = ext4_chksum(state->fc_crc, cur,
EXT4_FC_TAG_BASE_LEN + tl.fc_len);
break;
default:
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index a5205149adba..93240e35ee36 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -377,7 +377,12 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
loff_t pos = iocb->ki_pos;
struct inode *inode = file_inode(iocb->ki_filp);
- if (!error && size && flags & IOMAP_DIO_UNWRITTEN)
+
+ if (!error && size && (flags & IOMAP_DIO_UNWRITTEN) &&
+ (iocb->ki_flags & IOCB_ATOMIC))
+ error = ext4_convert_unwritten_extents_atomic(NULL, inode, pos,
+ size);
+ else if (!error && size && flags & IOMAP_DIO_UNWRITTEN)
error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
if (error)
return error;
@@ -688,10 +693,12 @@ out:
static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
+ int ret;
struct inode *inode = file_inode(iocb->ki_filp);
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ ret = ext4_emergency_state(inode->i_sb);
+ if (unlikely(ret))
+ return ret;
#ifdef CONFIG_FS_DAX
if (IS_DAX(inode))
@@ -700,7 +707,6 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (iocb->ki_flags & IOCB_ATOMIC) {
size_t len = iov_iter_count(from);
- int ret;
if (len < EXT4_SB(inode->i_sb)->s_awu_min ||
len > EXT4_SB(inode->i_sb)->s_awu_max)
@@ -741,7 +747,7 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order)
bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
(vmf->vma->vm_flags & VM_SHARED);
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
- pfn_t pfn;
+ unsigned long pfn;
if (write) {
sb_start_pagefault(sb);
@@ -756,9 +762,6 @@ retry:
return VM_FAULT_SIGBUS;
}
} else {
- result = filemap_fsnotify_fault(vmf);
- if (unlikely(result))
- return result;
filemap_invalidate_lock_shared(mapping);
}
result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops);
@@ -801,27 +804,33 @@ static const struct vm_operations_struct ext4_file_vm_ops = {
.page_mkwrite = ext4_page_mkwrite,
};
-static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int ext4_file_mmap_prepare(struct vm_area_desc *desc)
{
+ int ret;
+ struct file *file = desc->file;
struct inode *inode = file->f_mapping->host;
struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ if (file->f_mode & FMODE_WRITE)
+ ret = ext4_emergency_state(inode->i_sb);
+ else
+ ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0;
+ if (unlikely(ret))
+ return ret;
/*
* We don't support synchronous mappings for non-DAX files and
* for DAX files if underneath dax_device is not synchronous.
*/
- if (!daxdev_mapping_supported(vma, dax_dev))
+ if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file), dax_dev))
return -EOPNOTSUPP;
file_accessed(file);
if (IS_DAX(file_inode(file))) {
- vma->vm_ops = &ext4_dax_vm_ops;
- vm_flags_set(vma, VM_HUGEPAGE);
+ desc->vm_ops = &ext4_dax_vm_ops;
+ desc->vm_flags |= VM_HUGEPAGE;
} else {
- vma->vm_ops = &ext4_file_vm_ops;
+ desc->vm_ops = &ext4_file_vm_ops;
}
return 0;
}
@@ -838,7 +847,8 @@ static int ext4_sample_last_mounted(struct super_block *sb,
if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED)))
return 0;
- if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))
+ if (ext4_emergency_state(sb) || sb_rdonly(sb) ||
+ !sb_start_intwrite_trylock(sb))
return 0;
ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED);
@@ -881,8 +891,12 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
{
int ret;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ if (filp->f_mode & FMODE_WRITE)
+ ret = ext4_emergency_state(inode->i_sb);
+ else
+ ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0;
+ if (unlikely(ret))
+ return ret;
ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt);
if (ret)
@@ -921,12 +935,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
- loff_t maxbytes;
-
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
- else
- maxbytes = inode->i_sb->s_maxbytes;
+ loff_t maxbytes = ext4_get_maxbytes(inode);
switch (whence) {
default:
@@ -960,7 +969,7 @@ const struct file_operations ext4_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
#endif
- .mmap = ext4_file_mmap,
+ .mmap_prepare = ext4_file_mmap_prepare,
.open = ext4_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
@@ -969,7 +978,8 @@ const struct file_operations ext4_file_operations = {
.splice_write = iter_file_splice_write,
.fallocate = ext4_fallocate,
.fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
- FOP_DIO_PARALLEL_WRITE,
+ FOP_DIO_PARALLEL_WRITE |
+ FOP_DONTCACHE,
};
const struct inode_operations ext4_file_inode_operations = {
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index 383c6edea6dd..91185c40f755 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -393,6 +393,14 @@ static unsigned int ext4_getfsmap_find_sb(struct super_block *sb,
/* Reserved GDT blocks */
if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) {
len = le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
+
+ /*
+	 * mkfs.ext4 can set s_reserved_gdt_blocks to 0 in some cases;
+ * check for that.
+ */
+ if (!len)
+ return 0;
+
error = ext4_getfsmap_fill(meta_list, fsb, len,
EXT4_FMR_OWN_RESV_GDT);
if (error)
@@ -526,6 +534,7 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
ext4_group_t end_ag;
ext4_grpblk_t first_cluster;
ext4_grpblk_t last_cluster;
+ struct ext4_fsmap irec;
int error = 0;
bofs = le32_to_cpu(sbi->s_es->s_first_data_block);
@@ -609,10 +618,18 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
goto err;
}
- /* Report any gaps at the end of the bg */
+ /*
+ * The dummy record below will cause ext4_getfsmap_helper() to report
+ * any allocated blocks at the end of the range.
+ */
+ irec.fmr_device = 0;
+ irec.fmr_physical = end_fsb + 1;
+ irec.fmr_length = 0;
+ irec.fmr_owner = EXT4_FMR_OWN_FREE;
+ irec.fmr_flags = 0;
+
info->gfi_last = true;
- error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster + 1,
- 0, info);
+ error = ext4_getfsmap_helper(sb, info, &irec);
if (error)
goto err;
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index b40d3b29f7e5..e476c6de3074 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -132,20 +132,16 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
bool needs_barrier = false;
struct inode *inode = file->f_mapping->host;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ ret = ext4_emergency_state(inode->i_sb);
+ if (unlikely(ret))
+ return ret;
ASSERT(ext4_journal_current_handle() == NULL);
trace_ext4_sync_file_enter(file, datasync);
- if (sb_rdonly(inode->i_sb)) {
- /* Make sure that we read updated s_ext4_flags value */
- smp_rmb();
- if (ext4_forced_shutdown(inode->i_sb))
- ret = -EROFS;
+ if (sb_rdonly(inode->i_sb))
goto out;
- }
if (!EXT4_SB(inode->i_sb)->s_journal) {
ret = ext4_fsync_nojournal(file, start, end, datasync,
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index deabe29da7fb..33cd5b6b02d5 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -302,7 +302,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
if (len && IS_CASEFOLDED(dir) &&
(!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir))) {
- buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL);
+ buff = kzalloc(PATH_MAX, GFP_KERNEL);
if (!buff)
return -ENOMEM;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 21d228073d79..df4051613b29 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -691,7 +691,8 @@ static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
if (!bh || !buffer_uptodate(bh))
/*
* If the block is not in the buffer cache, then it
- * must have been written out.
+		 * must have been written out, or, though most unlikely,
+		 * is being migrated - a false failure should be OK here.
*/
goto out;
@@ -951,8 +952,9 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
sb = dir->i_sb;
sbi = EXT4_SB(sb);
- if (unlikely(ext4_forced_shutdown(sb)))
- return ERR_PTR(-EIO);
+ ret2 = ext4_emergency_state(sb);
+ if (unlikely(ret2))
+ return ERR_PTR(ret2);
ngroups = ext4_get_groups_count(sb);
trace_ext4_request_inode(dir, mode);
@@ -1282,14 +1284,13 @@ got:
inode->i_generation = get_random_u32();
/* Precompute checksum seed for inode metadata */
- if (ext4_has_metadata_csum(sb)) {
+ if (ext4_has_feature_metadata_csum(sb)) {
__u32 csum;
__le32 inum = cpu_to_le32(inode->i_ino);
__le32 gen = cpu_to_le32(inode->i_generation);
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum,
sizeof(inum));
- ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
- sizeof(gen));
+ ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen));
}
ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
@@ -1298,7 +1299,7 @@ got:
ei->i_extra_isize = sbi->s_want_extra_isize;
ei->i_inline_off = 0;
if (ext4_has_feature_inline_data(sb) &&
- (!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode)))
+ (!(ei->i_flags & (EXT4_DAX_FL|EXT4_EA_INODE_FL)) || S_ISDIR(mode)))
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
ret = inode;
err = dquot_alloc_inode(inode);
@@ -1334,6 +1335,8 @@ got:
}
}
+ ext4_set_inode_mapping_order(inode);
+
ext4_update_inode_fsync_trans(handle, inode, 1);
err = ext4_mark_inode_dirty(handle, inode);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 7de327fa7b1c..d45124318200 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -539,7 +539,7 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
int indirect_blks;
int blocks_to_boundary = 0;
int depth;
- int count = 0;
+ u64 count = 0;
ext4_fsblk_t first_block = 0;
trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
@@ -588,7 +588,7 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
count++;
/* Fill in size of a hole we found */
map->m_pblk = 0;
- map->m_len = min_t(unsigned int, map->m_len, count);
+ map->m_len = umin(map->m_len, count);
goto cleanup;
}
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 3536ca7e4fcc..1b094a4f3866 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -20,6 +20,11 @@
#define EXT4_INLINE_DOTDOT_OFFSET 2
#define EXT4_INLINE_DOTDOT_SIZE 4
+
+static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
+ struct inode *inode,
+ void **fsdata);
+
static int ext4_get_inline_size(struct inode *inode)
{
if (EXT4_I(inode)->i_inline_off)
@@ -228,7 +233,7 @@ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
struct ext4_inode *raw_inode;
int cp_len = 0;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
+ if (unlikely(ext4_emergency_state(inode->i_sb)))
return;
BUG_ON(!EXT4_I(inode)->i_inline_off);
@@ -298,7 +303,11 @@ static int ext4_create_inline_data(handle_t *handle,
if (error)
goto out;
- BUG_ON(!is.s.not_found);
+ if (!is.s.not_found) {
+ EXT4_ERROR_INODE(inode, "unexpected inline data xattr");
+ error = -EFSCORRUPTED;
+ goto out;
+ }
error = ext4_xattr_ibody_set(handle, inode, &i, &is);
if (error) {
@@ -349,7 +358,11 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
if (error)
goto out;
- BUG_ON(is.s.not_found);
+ if (is.s.not_found) {
+ EXT4_ERROR_INODE(inode, "missing inline data xattr");
+ error = -EFSCORRUPTED;
+ goto out;
+ }
len -= EXT4_MIN_INLINE_DATA_SIZE;
value = kzalloc(len, GFP_NOFS);
@@ -392,7 +405,7 @@ out:
}
static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
- unsigned int len)
+ loff_t len)
{
int ret, size, no_expand;
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -557,7 +570,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
return 0;
}
- needed_blocks = ext4_writepage_trans_blocks(inode);
+ needed_blocks = ext4_chunk_trans_extent(inode, 1);
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
@@ -596,6 +609,7 @@ retry:
goto out;
}
+ ext4_fc_track_inode(handle, inode);
ret = ext4_destroy_inline_data_nolock(handle, inode);
if (ret)
goto out;
@@ -606,6 +620,7 @@ retry:
} else
ret = ext4_block_write_begin(handle, folio, from, to,
ext4_get_block);
+ clear_buffer_new(folio_buffers(folio));
if (!ret && ext4_should_journal_data(inode)) {
ret = ext4_walk_page_buffers(handle, inode,
@@ -637,7 +652,7 @@ retry:
goto retry;
if (folio)
- block_commit_write(&folio->page, from, to);
+ block_commit_write(folio, from, to);
out:
if (folio) {
folio_unlock(folio);
@@ -653,91 +668,109 @@ out_nofolio:
}
/*
- * Try to write data in the inode.
- * If the inode has inline data, check whether the new write can be
- * in the inode also. If not, create the page the handle, move the data
- * to the page make it update and let the later codes create extent for it.
+ * Prepare the write for the inline data.
+ * If the data can be written into the inode, we just read
+ * the page and make it uptodate, and start the journal.
+ * Otherwise read the page and mark it dirty so that it can be
+ * handled in writepages() (the i_disksize update is left to the
+ * normal ext4_da_write_end()).
*/
-int ext4_try_to_write_inline_data(struct address_space *mapping,
- struct inode *inode,
- loff_t pos, unsigned len,
- struct folio **foliop)
+int ext4_generic_write_inline_data(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ struct folio **foliop,
+ void **fsdata, bool da)
{
int ret;
handle_t *handle;
struct folio *folio;
struct ext4_iloc iloc;
-
- if (pos + len > ext4_get_max_inline_size(inode))
- goto convert;
+ int retries = 0;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
return ret;
- /*
- * The possible write could happen in the inode,
- * so try to reserve the space in inode first.
- */
+retry_journal:
handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- handle = NULL;
- goto out;
+ goto out_release_bh;
}
ret = ext4_prepare_inline_data(handle, inode, pos + len);
if (ret && ret != -ENOSPC)
- goto out;
+ goto out_stop_journal;
- /* We don't have space in inline inode, so convert it to extent. */
if (ret == -ENOSPC) {
ext4_journal_stop(handle);
- brelse(iloc.bh);
- goto convert;
- }
+ if (!da) {
+ brelse(iloc.bh);
+ /* ENOSPC retries are handled inside the conversion path */
+ return ext4_convert_inline_data_to_extent(mapping, inode);
+ }
- ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh,
- EXT4_JTR_NONE);
- if (ret)
- goto out;
+ ret = ext4_da_convert_inline_data_to_extent(mapping, inode, fsdata);
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_journal;
+ goto out_release_bh;
+ }
folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
mapping_gfp_mask(mapping));
if (IS_ERR(folio)) {
ret = PTR_ERR(folio);
- goto out;
+ goto out_stop_journal;
}
- *foliop = folio;
down_read(&EXT4_I(inode)->xattr_sem);
+ /* Someone else has already converted it to an extent */
if (!ext4_has_inline_data(inode)) {
ret = 0;
- folio_unlock(folio);
- folio_put(folio);
- goto out_up_read;
+ goto out_release_folio;
}
if (!folio_test_uptodate(folio)) {
ret = ext4_read_inline_folio(inode, folio);
- if (ret < 0) {
- folio_unlock(folio);
- folio_put(folio);
- goto out_up_read;
- }
+ if (ret < 0)
+ goto out_release_folio;
}
- ret = 1;
- handle = NULL;
-out_up_read:
+ ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, EXT4_JTR_NONE);
+ if (ret)
+ goto out_release_folio;
+ *foliop = folio;
up_read(&EXT4_I(inode)->xattr_sem);
-out:
- if (handle && (ret != 1))
- ext4_journal_stop(handle);
+ brelse(iloc.bh);
+ return 1;
+
+out_release_folio:
+ up_read(&EXT4_I(inode)->xattr_sem);
+ folio_unlock(folio);
+ folio_put(folio);
+out_stop_journal:
+ ext4_journal_stop(handle);
+out_release_bh:
brelse(iloc.bh);
return ret;
-convert:
- return ext4_convert_inline_data_to_extent(mapping, inode);
+}
+
+/*
+ * Try to write data in the inode.
+ * If the inode has inline data, check whether the new write still
+ * fits in the inode. If not, create the page and the handle, move the
+ * data to the page, mark it uptodate, and let the later code create
+ * an extent for it.
+ */
+int ext4_try_to_write_inline_data(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ struct folio **foliop)
+{
+ if (pos + len > ext4_get_max_inline_size(inode))
+ return ext4_convert_inline_data_to_extent(mapping, inode);
+ return ext4_generic_write_inline_data(mapping, inode, pos, len,
+ foliop, NULL, false);
}
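
A rough, self-contained model (assumed names and placeholder bodies, not the real kernel call graph) of the call flow this refactor establishes: both the buffered-write and delalloc write_begin paths now funnel into one generic helper, distinguished only by a da flag that selects the conversion fallback on -ENOSPC.

#include <stdbool.h>
#include <stdio.h>

/* Placeholder conversion fallbacks, modelling the two paths the real
 * helper chooses between when the write no longer fits inline. */
static int convert_to_extent(void)    { puts("nodelalloc: convert");  return 0; }
static int da_convert_to_extent(void) { puts("delalloc: da convert"); return 0; }

/* Model of ext4_generic_write_inline_data(): when the data no longer
 * fits in the inode, pick the fallback based on the 'da' flag. */
static int generic_write_inline(bool fits_inline, bool da)
{
	if (fits_inline) {
		puts("write stays inline");
		return 1;	/* 1 == caller may write into the inode */
	}
	return da ? da_convert_to_extent() : convert_to_extent();
}

int main(void)
{
	generic_write_inline(true, false);
	generic_write_inline(false, false);	/* ext4_write_begin() path */
	generic_write_inline(false, true);	/* ext4_da_write_begin() path */
	return 0;
}
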
int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
@@ -867,6 +900,7 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
return ret;
}
+ clear_buffer_new(folio_buffers(folio));
folio_mark_dirty(folio);
folio_mark_uptodate(folio);
ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
@@ -881,94 +915,6 @@ out:
return ret;
}
-/*
- * Prepare the write for the inline data.
- * If the data can be written into the inode, we just read
- * the page and make it uptodate, and start the journal.
- * Otherwise read the page, makes it dirty so that it can be
- * handle in writepages(the i_disksize update is left to the
- * normal ext4_da_write_end).
- */
-int ext4_da_write_inline_data_begin(struct address_space *mapping,
- struct inode *inode,
- loff_t pos, unsigned len,
- struct folio **foliop,
- void **fsdata)
-{
- int ret;
- handle_t *handle;
- struct folio *folio;
- struct ext4_iloc iloc;
- int retries = 0;
-
- ret = ext4_get_inode_loc(inode, &iloc);
- if (ret)
- return ret;
-
-retry_journal:
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
-
- ret = ext4_prepare_inline_data(handle, inode, pos + len);
- if (ret && ret != -ENOSPC)
- goto out_journal;
-
- if (ret == -ENOSPC) {
- ext4_journal_stop(handle);
- ret = ext4_da_convert_inline_data_to_extent(mapping,
- inode,
- fsdata);
- if (ret == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry_journal;
- goto out;
- }
-
- /*
- * We cannot recurse into the filesystem as the transaction
- * is already started.
- */
- folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
- mapping_gfp_mask(mapping));
- if (IS_ERR(folio)) {
- ret = PTR_ERR(folio);
- goto out_journal;
- }
-
- down_read(&EXT4_I(inode)->xattr_sem);
- if (!ext4_has_inline_data(inode)) {
- ret = 0;
- goto out_release_page;
- }
-
- if (!folio_test_uptodate(folio)) {
- ret = ext4_read_inline_folio(inode, folio);
- if (ret < 0)
- goto out_release_page;
- }
- ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh,
- EXT4_JTR_NONE);
- if (ret)
- goto out_release_page;
-
- up_read(&EXT4_I(inode)->xattr_sem);
- *foliop = folio;
- brelse(iloc.bh);
- return 1;
-out_release_page:
- up_read(&EXT4_I(inode)->xattr_sem);
- folio_unlock(folio);
- folio_put(folio);
-out_journal:
- ext4_journal_stop(handle);
-out:
- brelse(iloc.bh);
- return ret;
-}
-
#ifdef INLINE_DIR_DEBUG
void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
void *inline_start, int inline_size)
@@ -1012,7 +958,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
int err;
struct ext4_dir_entry_2 *de;
- err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start,
+ err = ext4_find_dest_de(dir, iloc->bh, inline_start,
inline_size, fname, &de);
if (err)
return err;
@@ -1059,7 +1005,7 @@ static void *ext4_get_inline_xattr_pos(struct inode *inode,
}
/* Set the final de to cover the whole block. */
-static void ext4_update_final_de(void *de_buf, int old_size, int new_size)
+void ext4_update_final_de(void *de_buf, int old_size, int new_size)
{
struct ext4_dir_entry_2 *de, *prev_de;
void *limit;
@@ -1123,51 +1069,6 @@ static void ext4_restore_inline_data(handle_t *handle, struct inode *inode,
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
}
-static int ext4_finish_convert_inline_dir(handle_t *handle,
- struct inode *inode,
- struct buffer_head *dir_block,
- void *buf,
- int inline_size)
-{
- int err, csum_size = 0, header_size = 0;
- struct ext4_dir_entry_2 *de;
- void *target = dir_block->b_data;
-
- /*
- * First create "." and ".." and then copy the dir information
- * back to the block.
- */
- de = target;
- de = ext4_init_dot_dotdot(inode, de,
- inode->i_sb->s_blocksize, csum_size,
- le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1);
- header_size = (void *)de - target;
-
- memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE,
- inline_size - EXT4_INLINE_DOTDOT_SIZE);
-
- if (ext4_has_metadata_csum(inode->i_sb))
- csum_size = sizeof(struct ext4_dir_entry_tail);
-
- inode->i_size = inode->i_sb->s_blocksize;
- i_size_write(inode, inode->i_sb->s_blocksize);
- EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
- ext4_update_final_de(dir_block->b_data,
- inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size,
- inode->i_sb->s_blocksize - csum_size);
-
- if (csum_size)
- ext4_initialize_dirent_tail(dir_block,
- inode->i_sb->s_blocksize);
- set_buffer_uptodate(dir_block);
- unlock_buffer(dir_block);
- err = ext4_handle_dirty_dirblock(handle, inode, dir_block);
- if (err)
- return err;
- set_buffer_verified(dir_block);
- return ext4_mark_inode_dirty(handle, inode);
-}
-
static int ext4_convert_inline_data_nolock(handle_t *handle,
struct inode *inode,
struct ext4_iloc *iloc)
@@ -1239,8 +1140,17 @@ static int ext4_convert_inline_data_nolock(handle_t *handle,
error = ext4_handle_dirty_metadata(handle,
inode, data_bh);
} else {
- error = ext4_finish_convert_inline_dir(handle, inode, data_bh,
- buf, inline_size);
+ unlock_buffer(data_bh);
+ inode->i_size = inode->i_sb->s_blocksize;
+ i_size_write(inode, inode->i_sb->s_blocksize);
+ EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+
+ error = ext4_init_dirblock(handle, inode, data_bh,
+ le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode),
+ buf + EXT4_INLINE_DOTDOT_SIZE,
+ inline_size - EXT4_INLINE_DOTDOT_SIZE);
+ if (!error)
+ error = ext4_mark_inode_dirty(handle, inode);
}
out_restore:
@@ -1379,7 +1289,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
if (pos == 0) {
fake.inode = cpu_to_le32(inode->i_ino);
fake.name_len = 1;
- strcpy(fake.name, ".");
+ memcpy(fake.name, ".", 2);
fake.rec_len = ext4_rec_len_to_disk(
ext4_dir_rec_len(fake.name_len, NULL),
inline_size);
@@ -1389,7 +1299,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
} else if (pos == EXT4_INLINE_DOTDOT_OFFSET) {
fake.inode = cpu_to_le32(parent_ino);
fake.name_len = 2;
- strcpy(fake.name, "..");
+ memcpy(fake.name, "..", 3);
fake.rec_len = ext4_rec_len_to_disk(
ext4_dir_rec_len(fake.name_len, NULL),
inline_size);
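
The strcpy()-to-memcpy() change above copies the NUL terminator explicitly with a length known at the call site, which plays better with fortified string checks on a short fixed destination. A minimal illustration (the field size here is made up):

#include <stdio.h>
#include <string.h>

struct toy_dirent {
	unsigned char name_len;
	char name[4];	/* small fixed field, as with the fake dirents */
};

int main(void)
{
	struct toy_dirent fake;

	/* The length is known at the call site, so copy it (plus the
	 * NUL) explicitly instead of relying on strcpy(). */
	fake.name_len = 2;
	memcpy(fake.name, "..", 3);
	printf("%.*s\n", fake.name_len, fake.name);
	return 0;
}
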
@@ -1928,7 +1838,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
};
- needed_blocks = ext4_writepage_trans_blocks(inode);
+ needed_blocks = ext4_chunk_trans_extent(inode, 1);
handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -1967,7 +1877,12 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0)
goto out_error;
- BUG_ON(is.s.not_found);
+ if (is.s.not_found) {
+ EXT4_ERROR_INODE(inode,
+ "missing inline data xattr");
+ err = -EFSCORRUPTED;
+ goto out_error;
+ }
value_len = le32_to_cpu(is.s.here->e_value_size);
value = kmalloc(value_len, GFP_NOFS);
@@ -2043,7 +1958,7 @@ int ext4_convert_inline_data(struct inode *inode)
return 0;
}
- needed_blocks = ext4_writepage_trans_blocks(inode);
+ needed_blocks = ext4_chunk_trans_extent(inode, 1);
iloc.bh = NULL;
error = ext4_get_inode_loc(inode, &iloc);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7c54ae5fcbd4..5b7a15db4953 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -31,6 +31,7 @@
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
+#include <linux/rmap.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
@@ -57,29 +58,27 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle,
static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
struct ext4_inode_info *ei)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 csum;
__u16 dummy_csum = 0;
int offset = offsetof(struct ext4_inode, i_checksum_lo);
unsigned int csum_size = sizeof(dummy_csum);
- csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset);
- csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size);
+ csum = ext4_chksum(ei->i_csum_seed, (__u8 *)raw, offset);
+ csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size);
offset += csum_size;
- csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
+ csum = ext4_chksum(csum, (__u8 *)raw + offset,
EXT4_GOOD_OLD_INODE_SIZE - offset);
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
offset = offsetof(struct ext4_inode, i_checksum_hi);
- csum = ext4_chksum(sbi, csum, (__u8 *)raw +
- EXT4_GOOD_OLD_INODE_SIZE,
+ csum = ext4_chksum(csum, (__u8 *)raw + EXT4_GOOD_OLD_INODE_SIZE,
offset - EXT4_GOOD_OLD_INODE_SIZE);
if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
- csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum,
+ csum = ext4_chksum(csum, (__u8 *)&dummy_csum,
csum_size);
offset += csum_size;
}
- csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
+ csum = ext4_chksum(csum, (__u8 *)raw + offset,
EXT4_INODE_SIZE(inode->i_sb) - offset);
}
@@ -93,7 +92,7 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
cpu_to_le32(EXT4_OS_LINUX) ||
- !ext4_has_metadata_csum(inode->i_sb))
+ !ext4_has_feature_metadata_csum(inode->i_sb))
return 1;
provided = le16_to_cpu(raw->i_checksum_lo);
@@ -114,7 +113,7 @@ void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
cpu_to_le32(EXT4_OS_LINUX) ||
- !ext4_has_metadata_csum(inode->i_sb))
+ !ext4_has_feature_metadata_csum(inode->i_sb))
return;
csum = ext4_inode_csum(inode, raw, ei);
@@ -141,16 +140,13 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
new_size);
}
-static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
- int pextents);
-
/*
* Test whether an inode is a fast symlink.
* A fast symlink has its symlink data stored in ext4_inode_info->i_data.
*/
int ext4_inode_is_fast_symlink(struct inode *inode)
{
- if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
+ if (!ext4_has_feature_ea_inode(inode->i_sb)) {
int ea_blocks = EXT4_I(inode)->i_file_acl ?
EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
@@ -181,6 +177,8 @@ void ext4_evict_inode(struct inode *inode)
trace_ext4_evict_inode(inode);
+ dax_break_layout_final(inode);
+
if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
ext4_evict_ea_inode(inode);
if (inode->i_nlink) {
@@ -383,10 +381,11 @@ static int __check_block_validity(struct inode *inode, const char *func,
unsigned int line,
struct ext4_map_blocks *map)
{
- if (ext4_has_feature_journal(inode->i_sb) &&
- (inode->i_ino ==
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+
+ if (journal && inode == journal->j_inode)
return 0;
+
if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
ext4_error_inode(inode, func, line, map->m_pblk,
"lblock %lu mapped to illegal pblock %llu "
@@ -412,6 +411,32 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
return ret;
}
+/*
+ * For generic regular files, when updating the extent tree, Ext4 should
+ * hold the i_rwsem and invalidate_lock exclusively. This ensures
+ * exclusion against concurrent page faults, as well as reads and writes.
+ */
+#ifdef CONFIG_EXT4_DEBUG
+void ext4_check_map_extents_env(struct inode *inode)
+{
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return;
+
+ if (!S_ISREG(inode->i_mode) ||
+ IS_NOQUOTA(inode) || IS_VERITY(inode) ||
+ is_special_ino(inode->i_sb, inode->i_ino) ||
+ (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) ||
+ ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
+ ext4_verity_in_progress(inode))
+ return;
+
+ WARN_ON_ONCE(!inode_is_locked(inode) &&
+ !rwsem_is_locked(&inode->i_mapping->invalidate_lock));
+}
+#else
+void ext4_check_map_extents_env(struct inode *inode) {}
+#endif
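
As a toy model of the invariant this debug helper asserts (a regular file must hold at least one of the two locks while its extent tree is updated), with plain booleans standing in for the kernel's rwsems:

#include <assert.h>
#include <stdbool.h>

struct toy_inode {
	bool i_rwsem_held;	/* stands in for inode_is_locked() */
	bool invalidate_held;	/* stands in for invalidate_lock */
	bool is_regular;
};

/* Model of ext4_check_map_extents_env(): regular files must hold at
 * least one of the two locks while the extent tree is updated. */
static void check_map_extents_env(const struct toy_inode *inode)
{
	if (!inode->is_regular)
		return;
	assert(inode->i_rwsem_held || inode->invalidate_held);
}

int main(void)
{
	struct toy_inode ok = { .i_rwsem_held = true, .is_regular = true };

	check_map_extents_env(&ok);	/* passes */
	return 0;
}
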
+
#define check_block_validity(inode, map) \
__check_block_validity((inode), __func__, __LINE__, (map))
@@ -458,16 +483,73 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
}
#endif /* ES_AGGRESSIVE_TEST */
+static int ext4_map_query_blocks_next_in_leaf(handle_t *handle,
+ struct inode *inode, struct ext4_map_blocks *map,
+ unsigned int orig_mlen)
+{
+ struct ext4_map_blocks map2;
+ unsigned int status, status2;
+ int retval;
+
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+
+ WARN_ON_ONCE(!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF));
+ WARN_ON_ONCE(orig_mlen <= map->m_len);
+
+ /* Prepare map2 for lookup in next leaf block */
+ map2.m_lblk = map->m_lblk + map->m_len;
+ map2.m_len = orig_mlen - map->m_len;
+ map2.m_flags = 0;
+ retval = ext4_ext_map_blocks(handle, inode, &map2, 0);
+
+ if (retval <= 0) {
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status, false);
+ return map->m_len;
+ }
+
+ if (unlikely(retval != map2.m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map2.m_len);
+ WARN_ON(1);
+ }
+
+ status2 = map2.m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+
+ /*
+ * If map2 is contiguous with map, then let's insert it as a single
+ * extent in es cache and return the combined length of both the maps.
+ */
+ if (map->m_pblk + map->m_len == map2.m_pblk &&
+ status == status2) {
+ ext4_es_insert_extent(inode, map->m_lblk,
+ map->m_len + map2.m_len, map->m_pblk,
+ status, false);
+ map->m_len += map2.m_len;
+ } else {
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status, false);
+ }
+
+ return map->m_len;
+}
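
The merge condition above is pure arithmetic: two lookups coalesce in the extent status cache only when the second mapping starts exactly where the first one ends physically and both carry the same written/unwritten status. A standalone sketch of that check (struct and names are illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_map {
	uint32_t lblk;		/* logical start block */
	uint64_t pblk;		/* physical start block */
	uint32_t len;
	bool unwritten;
};

/* Merge map2 into map when they are physically contiguous and share
 * the same status, mirroring the leaf-boundary coalescing above. */
static bool try_merge(struct toy_map *map, const struct toy_map *map2)
{
	if (map->pblk + map->len == map2->pblk &&
	    map->unwritten == map2->unwritten) {
		map->len += map2->len;
		return true;
	}
	return false;
}

int main(void)
{
	struct toy_map a = { .lblk = 0, .pblk = 1000, .len = 8 };
	struct toy_map b = { .lblk = 8, .pblk = 1008, .len = 4 };

	printf("merged: %d, len now %u\n", try_merge(&a, &b), a.len);
	return 0;
}
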
+
static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
- struct ext4_map_blocks *map)
+ struct ext4_map_blocks *map, int flags)
{
unsigned int status;
int retval;
+ unsigned int orig_mlen = map->m_len;
+ flags &= EXT4_EX_QUERY_FILTER;
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- retval = ext4_ext_map_blocks(handle, inode, map, 0);
+ retval = ext4_ext_map_blocks(handle, inode, map, flags);
else
- retval = ext4_ind_map_blocks(handle, inode, map, 0);
+ retval = ext4_ind_map_blocks(handle, inode, map, flags);
if (retval <= 0)
return retval;
@@ -480,11 +562,22 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
WARN_ON(1);
}
- status = map->m_flags & EXT4_MAP_UNWRITTEN ?
- EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
- ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status, false);
- return retval;
+ /*
+ * No need to query next in leaf:
+ * - if returned extent is not last in leaf or
+ * - if the last in leaf is the full requested range
+ */
+ if (!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF) ||
+ map->m_len == orig_mlen) {
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status, false);
+ return retval;
+ }
+
+ return ext4_map_query_blocks_next_in_leaf(handle, inode, map,
+ orig_mlen);
}
static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
@@ -598,6 +691,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct extent_status es;
int retval;
int ret = 0;
+ unsigned int orig_mlen = map->m_len;
#ifdef ES_AGGRESSIVE_TEST
struct ext4_map_blocks orig_map;
@@ -618,9 +712,18 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
return -EFSCORRUPTED;
+ /*
+ * Callers in the data submission context are the only ones allowed to
+ * map blocks on regular files without holding the i_rwsem or
+ * invalidate_lock; they must not cache unrelated ranges.
+ */
+ if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
+ WARN_ON_ONCE(!(flags & EXT4_EX_NOCACHE));
+ else
+ ext4_check_map_extents_env(inode);
+
/* Lookup extent status tree firstly */
- if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
- ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
map->m_pblk = ext4_es_pblock(&es) +
map->m_lblk - es.es_lblk;
@@ -649,7 +752,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
ext4_map_blocks_es_recheck(handle, inode, map,
&orig_map, flags);
#endif
- goto found;
+ if (!(flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) ||
+ orig_mlen == map->m_len)
+ goto found;
+
+ map->m_len = orig_mlen;
}
/*
* In the query cache no-wait mode, nothing we can do more if we
@@ -663,7 +770,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
* file system block.
*/
down_read(&EXT4_I(inode)->i_data_sem);
- retval = ext4_map_query_blocks(handle, inode, map);
+ retval = ext4_map_query_blocks(handle, inode, map, flags);
up_read((&EXT4_I(inode)->i_data_sem));
found:
@@ -692,6 +799,8 @@ found:
if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
return retval;
+
+ ext4_fc_track_inode(handle, inode);
/*
* New blocks allocate and/or writing to unwritten extent
* will possibly result in updating i_data, so we take
@@ -751,7 +860,7 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
flags &= EXT4_MAP_FLAGS;
/* Dummy buffer_head? Set non-atomically. */
- if (!bh->b_page) {
+ if (!bh->b_folio) {
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags;
return;
}
@@ -766,6 +875,26 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
} while (unlikely(!try_cmpxchg(&bh->b_state, &old_state, new_state)));
}
+/*
+ * Make sure that the current journal transaction has enough credits to map
+ * one extent. Return -EAGAIN if it cannot extend the current running
+ * transaction.
+ */
+static inline int ext4_journal_ensure_extent_credits(handle_t *handle,
+ struct inode *inode)
+{
+ int credits;
+ int ret;
+
+ /* Called from ext4_da_write_begin() which has no handle started? */
+ if (!handle)
+ return 0;
+
+ credits = ext4_chunk_trans_blocks(inode, 1);
+ ret = __ext4_journal_ensure_credits(handle, credits, credits, 0);
+ return ret <= 0 ? ret : -EAGAIN;
+}
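
A hedged sketch of the restart contract this helper introduces: the credit-extension primitive returns a positive value when the handle had to be restarted, which the helper folds into -EAGAIN so callers back off and retry. The primitive below is a stand-in, not the real jbd2 API.

#include <stdio.h>

#define TOY_EAGAIN 11

/* Stand-in for __ext4_journal_ensure_credits(): <0 error, 0 enough
 * credits already, >0 the transaction had to be restarted. */
static int ensure_credits_stub(int have, int want)
{
	if (have >= want)
		return 0;
	return 1;	/* pretend we restarted the transaction */
}

/* Model of ext4_journal_ensure_extent_credits(): map a restart to
 * -EAGAIN so writeback unwinds and retries with a fresh handle. */
static int ensure_extent_credits(int have, int want)
{
	int ret = ensure_credits_stub(have, want);

	return ret <= 0 ? ret : -TOY_EAGAIN;
}

int main(void)
{
	printf("enough:  %d\n", ensure_extent_credits(20, 10));
	printf("restart: %d\n", ensure_extent_credits(5, 10));
	return 0;
}
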
+
static int _ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int flags)
{
@@ -1005,7 +1134,12 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode,
*/
static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh)
{
- folio_mark_dirty(bh->b_folio);
+ struct folio *folio = bh->b_folio;
+ struct inode *inode = folio->mapping->host;
+
+ /* only regular files have a_ops */
+ if (S_ISREG(inode->i_mode))
+ folio_mark_dirty(folio);
return ext4_handle_dirty_metadata(handle, NULL, bh);
}
@@ -1023,7 +1157,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio,
loff_t pos, unsigned len,
get_block_t *get_block)
{
- unsigned from = pos & (PAGE_SIZE - 1);
+ unsigned int from = offset_in_folio(folio, pos);
unsigned to = from + len;
struct inode *inode = folio->mapping->host;
unsigned block_start, block_end;
@@ -1037,8 +1171,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio,
bool should_journal_data = ext4_should_journal_data(inode);
BUG_ON(!folio_test_locked(folio));
- BUG_ON(from > PAGE_SIZE);
- BUG_ON(to > PAGE_SIZE);
+ BUG_ON(to > folio_size(folio));
BUG_ON(from > to);
head = folio_buffers(folio);
@@ -1056,11 +1189,13 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio,
}
continue;
}
- if (buffer_new(bh))
+ if (WARN_ON_ONCE(buffer_new(bh)))
clear_buffer_new(bh);
if (!buffer_mapped(bh)) {
WARN_ON(bh->b_size != blocksize);
- err = get_block(inode, block, bh, 1);
+ err = ext4_journal_ensure_extent_credits(handle, inode);
+ if (!err)
+ err = get_block(inode, block, bh, 1);
if (err)
break;
if (buffer_new(bh)) {
@@ -1137,7 +1272,8 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio,
* and the ext4_write_end(). So doing the jbd2_journal_start at the start of
* ext4_write_begin() is the right place.
*/
-static int ext4_write_begin(struct file *file, struct address_space *mapping,
+static int ext4_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
{
@@ -1149,18 +1285,18 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
pgoff_t index;
unsigned from, to;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ ret = ext4_emergency_state(inode->i_sb);
+ if (unlikely(ret))
+ return ret;
trace_ext4_write_begin(inode, pos, len);
/*
* Reserve one block more for addition to orphan list in case
* we allocate blocks but write fails for some reason
*/
- needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
+ needed_blocks = ext4_chunk_trans_extent(inode,
+ ext4_journal_blocks_per_folio(inode)) + 1;
index = pos >> PAGE_SHIFT;
- from = pos & (PAGE_SIZE - 1);
- to = from + len;
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
@@ -1172,17 +1308,23 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
}
/*
- * __filemap_get_folio() can take a long time if the
+ * write_begin_get_folio() can take a long time if the
* system is thrashing due to memory pressure, or if the folio
* is being written back. So grab it first before we start
* the transaction handle. This also allows us to allocate
* the folio (if needed) without using GFP_NOFS.
*/
retry_grab:
- folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
- mapping_gfp_mask(mapping));
+ folio = write_begin_get_folio(iocb, mapping, index, len);
if (IS_ERR(folio))
return PTR_ERR(folio);
+
+ if (pos + len > folio_pos(folio) + folio_size(folio))
+ len = folio_pos(folio) + folio_size(folio) - pos;
+
+ from = offset_in_folio(folio, pos);
+ to = from + len;
+
/*
* The same as page allocation, we prealloc buffer heads before
* starting the handle.
@@ -1251,8 +1393,9 @@ retry_journal:
ext4_orphan_del(NULL, inode);
}
- if (ret == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries))
+ if (ret == -EAGAIN ||
+ (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries)))
goto retry_journal;
folio_put(folio);
return ret;
@@ -1272,17 +1415,18 @@ static int write_end_fn(handle_t *handle, struct inode *inode,
ret = ext4_dirty_journalled_data(handle, bh);
clear_buffer_meta(bh);
clear_buffer_prio(bh);
+ clear_buffer_new(bh);
return ret;
}
/*
* We need to pick up the new inode size which generic_commit_write gave us
- * `file' can be NULL - eg, when called from page_symlink().
+ * `iocb` can be NULL - eg, when called from page_symlink().
*
* ext4 never places buffers on inode->i_mapping->i_private_list. metadata
* buffers are managed internally.
*/
-static int ext4_write_end(struct file *file,
+static int ext4_write_end(const struct kiocb *iocb,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
@@ -1301,7 +1445,7 @@ static int ext4_write_end(struct file *file,
return ext4_write_inline_data_end(inode, pos, len, copied,
folio);
- copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ copied = block_write_end(pos, len, copied, folio);
/*
* it's important to update i_size while still holding folio lock:
* page writeout could otherwise come in and zero beyond i_size.
@@ -1387,7 +1531,7 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle,
} while (bh != head);
}
-static int ext4_journalled_write_end(struct file *file,
+static int ext4_journalled_write_end(const struct kiocb *iocb,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
@@ -1544,11 +1688,12 @@ struct mpage_da_data {
unsigned int can_map:1; /* Can writepages call map blocks? */
/* These are internal state of ext4_do_writepages() */
- pgoff_t first_page; /* The first page to write */
- pgoff_t next_page; /* Current page to examine */
- pgoff_t last_page; /* Last page to examine */
+ loff_t start_pos; /* The start pos to write */
+ loff_t next_pos; /* Current pos to examine */
+ loff_t end_pos; /* Last pos to examine */
+
/*
- * Extent to map - this can be after first_page because that can be
+ * Extent to map - this can be after start_pos because that can be
* fully mapped. We somewhat abuse m_flags to store whether the extent
* is delalloc or unwritten.
*/
@@ -1568,38 +1713,38 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
struct inode *inode = mpd->inode;
struct address_space *mapping = inode->i_mapping;
- /* This is necessary when next_page == 0. */
- if (mpd->first_page >= mpd->next_page)
+ /* This is necessary when next_pos == 0. */
+ if (mpd->start_pos >= mpd->next_pos)
return;
mpd->scanned_until_end = 0;
- index = mpd->first_page;
- end = mpd->next_page - 1;
if (invalidate) {
ext4_lblk_t start, last;
- start = index << (PAGE_SHIFT - inode->i_blkbits);
- last = end << (PAGE_SHIFT - inode->i_blkbits);
+ start = EXT4_B_TO_LBLK(inode, mpd->start_pos);
+ last = mpd->next_pos >> inode->i_blkbits;
/*
* avoid racing with extent status tree scans made by
* ext4_insert_delayed_block()
*/
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_es_remove_extent(inode, start, last - start + 1);
+ ext4_es_remove_extent(inode, start, last - start);
up_write(&EXT4_I(inode)->i_data_sem);
}
folio_batch_init(&fbatch);
- while (index <= end) {
- nr = filemap_get_folios(mapping, &index, end, &fbatch);
+ index = mpd->start_pos >> PAGE_SHIFT;
+ end = mpd->next_pos >> PAGE_SHIFT;
+ while (index < end) {
+ nr = filemap_get_folios(mapping, &index, end - 1, &fbatch);
if (nr == 0)
break;
for (i = 0; i < nr; i++) {
struct folio *folio = fbatch.folios[i];
- if (folio->index < mpd->first_page)
+ if (folio_pos(folio) < mpd->start_pos)
continue;
- if (folio_next_index(folio) - 1 > end)
+ if (folio_next_index(folio) > end)
continue;
BUG_ON(!folio_test_locked(folio));
BUG_ON(folio_test_writeback(folio));
@@ -1760,6 +1905,8 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len,
(unsigned long) map->m_lblk);
+ ext4_check_map_extents_env(inode);
+
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
map->m_len = min_t(unsigned int, map->m_len,
@@ -1800,7 +1947,7 @@ found:
if (ext4_has_inline_data(inode))
retval = 0;
else
- retval = ext4_map_query_blocks(NULL, inode, map);
+ retval = ext4_map_query_blocks(NULL, inode, map, 0);
up_read(&EXT4_I(inode)->i_data_sem);
if (retval)
return retval < 0 ? retval : 0;
@@ -1823,7 +1970,7 @@ add_delayed:
goto found;
}
} else if (!ext4_has_inline_data(inode)) {
- retval = ext4_map_query_blocks(NULL, inode, map);
+ retval = ext4_map_query_blocks(NULL, inode, map, 0);
if (retval) {
up_write(&EXT4_I(inode)->i_data_sem);
return retval < 0 ? retval : 0;
@@ -1899,7 +2046,8 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio)
{
- mpd->first_page += folio_nr_pages(folio);
+ mpd->start_pos += folio_size(folio);
+ mpd->wbc->nr_to_write -= folio_nr_pages(folio);
folio_unlock(folio);
}
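
The bookkeeping switch from page indexes to byte positions is easy to model: once large folios are possible, advancing by folio_size() and charging folio_nr_pages() against nr_to_write keeps both counters exact regardless of folio order. A small sketch with hypothetical sizes:

#include <stdio.h>

#define TOY_PAGE_SIZE 4096UL

struct toy_mpd {
	unsigned long long start_pos;	/* byte position, not page index */
	long nr_to_write;
};

/* Model of mpage_folio_done(): advance by the folio's byte size and
 * charge all of its pages, so a 64K folio counts as 16 pages. */
static void folio_done(struct toy_mpd *mpd, unsigned long folio_size)
{
	mpd->start_pos += folio_size;
	mpd->nr_to_write -= (long)(folio_size / TOY_PAGE_SIZE);
}

int main(void)
{
	struct toy_mpd mpd = { .start_pos = 0, .nr_to_write = 32 };

	folio_done(&mpd, 64 * 1024);	/* one order-4 folio */
	printf("pos=%llu, nr_to_write=%ld\n", mpd.start_pos, mpd.nr_to_write);
	return 0;
}
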
@@ -1909,7 +2057,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
loff_t size;
int err;
- BUG_ON(folio->index != mpd->first_page);
+ WARN_ON_ONCE(folio_pos(folio) != mpd->start_pos);
folio_clear_dirty_for_io(folio);
/*
* We have to be very careful here! Nothing protects writeback path
@@ -1930,8 +2078,6 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
!ext4_verity_in_progress(mpd->inode))
len = size & (len - 1);
err = ext4_bio_write_folio(&mpd->io_submit, folio, len);
- if (!err)
- mpd->wbc->nr_to_write--;
return err;
}
@@ -2154,7 +2300,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
start = mpd->map.m_lblk >> bpp_bits;
end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
- lblk = start << bpp_bits;
pblock = mpd->map.m_pblk;
folio_batch_init(&fbatch);
@@ -2165,6 +2310,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
for (i = 0; i < nr; i++) {
struct folio *folio = fbatch.folios[i];
+ lblk = folio->index << bpp_bits;
err = mpage_process_folio(mpd, folio, &lblk, &pblock,
&map_bh);
/*
@@ -2198,6 +2344,11 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
int get_blocks_flags;
int err, dioread_nolock;
+ /* Make sure transaction has enough credits for this extent */
+ err = ext4_journal_ensure_extent_credits(handle, inode);
+ if (err < 0)
+ return err;
+
trace_ext4_da_write_pages_extent(inode, map);
/*
* Call ext4_map_blocks() to allocate any delayed allocation blocks, or
@@ -2207,11 +2358,15 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
* previously reserved. However we must not fail because we're in
* writeback and there is nothing we can do about it so it might result
* in data loss. So use reserved blocks to allocate metadata if
- * possible.
+ * possible. In addition, do not cache any unrelated extents: the
+ * writeback path only holds the folio lock, not the i_rwsem or
+ * invalidate_lock, so caching them could corrupt the extent status
+ * tree.
*/
get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
EXT4_GET_BLOCKS_METADATA_NOFAIL |
- EXT4_GET_BLOCKS_IO_SUBMIT;
+ EXT4_GET_BLOCKS_IO_SUBMIT |
+ EXT4_EX_NOCACHE;
+
dioread_nolock = ext4_should_dioread_nolock(inode);
if (dioread_nolock)
get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
@@ -2225,7 +2380,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
mpd->io_submit.io_end->handle = handle->h_rsv_handle;
handle->h_rsv_handle = NULL;
}
- ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
+ ext4_set_io_unwritten_flag(mpd->io_submit.io_end);
}
BUG_ON(map->m_len == 0);
@@ -2233,6 +2388,47 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
}
/*
+ * This is used to submit mapped buffers in a single folio that is not fully
+ * mapped for various reasons, such as insufficient space or journal credits.
+ */
+static int mpage_submit_partial_folio(struct mpage_da_data *mpd)
+{
+ struct inode *inode = mpd->inode;
+ struct folio *folio;
+ loff_t pos;
+ int ret;
+
+ folio = filemap_get_folio(inode->i_mapping,
+ mpd->start_pos >> PAGE_SHIFT);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ /*
+ * The mapped position should be within the current processing folio
+ * but must not be the folio start position.
+ */
+ pos = ((loff_t)mpd->map.m_lblk) << inode->i_blkbits;
+ if (WARN_ON_ONCE((folio_pos(folio) == pos) ||
+ !folio_contains(folio, pos >> PAGE_SHIFT)))
+ return -EINVAL;
+
+ ret = mpage_submit_folio(mpd, folio);
+ if (ret)
+ goto out;
+ /*
+ * Update start_pos to prevent this folio from being released in
+ * mpage_release_unused_pages(), it will be reset to the aligned folio
+ * pos when this folio is written again in the next round. Additionally,
+ * do not update wbc->nr_to_write here, as it will be updated once the
+ * entire folio has finished processing.
+ */
+ mpd->start_pos = pos;
+out:
+ folio_unlock(folio);
+ folio_put(folio);
+ return ret;
+}
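
The sanity checks above reduce to byte arithmetic: the first unmapped position must lie inside the folio being processed but strictly after its start (otherwise nothing was mapped and there is nothing partial to submit). A standalone sketch of that precondition:

#include <stdbool.h>
#include <stdio.h>

/* Model of the mpage_submit_partial_folio() precondition: pos must be
 * inside [folio_pos, folio_pos + folio_size) and not at the start. */
static bool partial_pos_valid(unsigned long long folio_pos,
			      unsigned long folio_size,
			      unsigned long long pos)
{
	if (pos == folio_pos)
		return false;	/* nothing mapped yet */
	return pos > folio_pos && pos < folio_pos + folio_size;
}

int main(void)
{
	printf("%d\n", partial_pos_valid(65536, 16384, 69632));	/* 1 */
	printf("%d\n", partial_pos_valid(65536, 16384, 65536));	/* 0 */
	return 0;
}
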
+
+/*
* mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
* mpd->len and submit pages underlying it for IO
*
@@ -2273,17 +2469,25 @@ static int mpage_map_and_submit_extent(handle_t *handle,
if (err < 0) {
struct super_block *sb = inode->i_sb;
- if (ext4_forced_shutdown(sb))
+ if (ext4_emergency_state(sb))
goto invalidate_dirty_pages;
/*
* Let the upper layers retry transient errors.
* In the case of ENOSPC, if ext4_count_free_blocks()
* is non-zero, a commit should free up blocks.
*/
- if ((err == -ENOMEM) ||
+ if ((err == -ENOMEM) || (err == -EAGAIN) ||
(err == -ENOSPC && ext4_count_free_clusters(sb))) {
- if (progress)
+ /*
+ * We may have already allocated extents for
+ * some bhs inside the folio, issue the
+ * corresponding data to prevent stale data.
+ */
+ if (progress) {
+ if (mpage_submit_partial_folio(mpd))
+ goto invalidate_dirty_pages;
goto update_disksize;
+ }
return err;
}
ext4_msg(sb, KERN_CRIT,
@@ -2317,7 +2521,7 @@ update_disksize:
* Update on-disk size after IO is submitted. Races with
* truncate are avoided by checking i_size under i_data_sem.
*/
- disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT;
+ disksize = mpd->start_pos;
if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) {
int err2;
loff_t i_size;
@@ -2341,21 +2545,6 @@ update_disksize:
return err;
}
-/*
- * Calculate the total number of credits to reserve for one writepages
- * iteration. This is called from ext4_writepages(). We map an extent of
- * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
- * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
- * bpp - 1 blocks in bpp different extents.
- */
-static int ext4_da_writepages_trans_blocks(struct inode *inode)
-{
- int bpp = ext4_journal_blocks_per_page(inode);
-
- return ext4_meta_trans_blocks(inode,
- MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
-}
-
static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio,
size_t len)
{
@@ -2386,7 +2575,7 @@ static int mpage_journal_page_buffers(handle_t *handle,
size_t len = folio_size(folio);
folio_clear_checked(folio);
- mpd->wbc->nr_to_write--;
+ mpd->wbc->nr_to_write -= folio_nr_pages(folio);
if (folio_pos(folio) + len > size &&
!ext4_verity_in_progress(inode))
@@ -2420,15 +2609,15 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
struct address_space *mapping = mpd->inode->i_mapping;
struct folio_batch fbatch;
unsigned int nr_folios;
- pgoff_t index = mpd->first_page;
- pgoff_t end = mpd->last_page;
+ pgoff_t index = mpd->start_pos >> PAGE_SHIFT;
+ pgoff_t end = mpd->end_pos >> PAGE_SHIFT;
xa_mark_t tag;
int i, err = 0;
int blkbits = mpd->inode->i_blkbits;
ext4_lblk_t lblk;
struct buffer_head *head;
handle_t *handle = NULL;
- int bpp = ext4_journal_blocks_per_page(mpd->inode);
+ int bpp = ext4_journal_blocks_per_folio(mpd->inode);
if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
@@ -2436,7 +2625,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
tag = PAGECACHE_TAG_DIRTY;
mpd->map.m_len = 0;
- mpd->next_page = index;
+ mpd->next_pos = mpd->start_pos;
if (ext4_should_journal_data(mpd->inode)) {
handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE,
bpp);
@@ -2467,7 +2656,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
goto out;
/* If we can't merge this page, we are done. */
- if (mpd->map.m_len > 0 && mpd->next_page != folio->index)
+ if (mpd->map.m_len > 0 &&
+ mpd->next_pos != folio_pos(folio))
goto out;
if (handle) {
@@ -2513,8 +2703,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
}
if (mpd->map.m_len == 0)
- mpd->first_page = folio->index;
- mpd->next_page = folio_next_index(folio);
+ mpd->start_pos = folio_pos(folio);
+ mpd->next_pos = folio_pos(folio) + folio_size(folio);
/*
* Writeout when we cannot modify metadata is simple.
* Just submit the page. For data=journal mode we
@@ -2599,10 +2789,9 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
* *never* be called, so if that ever happens, we would want
* the stack trace.
*/
- if (unlikely(ext4_forced_shutdown(mapping->host->i_sb))) {
- ret = -EROFS;
+ ret = ext4_emergency_state(mapping->host->i_sb);
+ if (unlikely(ret))
goto out_writepages;
- }
/*
* If we have inline data and arrive here, it means that
@@ -2643,12 +2832,12 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
mpd->journalled_more_data = 0;
if (ext4_should_dioread_nolock(inode)) {
+ int bpf = ext4_journal_blocks_per_folio(inode);
/*
* We may need to convert up to one extent per block in
- * the page and we may dirty the inode.
+ * the folio and we may dirty the inode.
*/
- rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
- PAGE_SIZE >> inode->i_blkbits);
+ rsv_blocks = 1 + ext4_ext_index_trans_blocks(inode, bpf);
}
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
@@ -2658,18 +2847,18 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
writeback_index = mapping->writeback_index;
if (writeback_index)
cycled = 0;
- mpd->first_page = writeback_index;
- mpd->last_page = -1;
+ mpd->start_pos = writeback_index << PAGE_SHIFT;
+ mpd->end_pos = LLONG_MAX;
} else {
- mpd->first_page = wbc->range_start >> PAGE_SHIFT;
- mpd->last_page = wbc->range_end >> PAGE_SHIFT;
+ mpd->start_pos = wbc->range_start;
+ mpd->end_pos = wbc->range_end;
}
ext4_io_submit_init(&mpd->io_submit, wbc);
retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag_pages_for_writeback(mapping, mpd->first_page,
- mpd->last_page);
+ tag_pages_for_writeback(mapping, mpd->start_pos >> PAGE_SHIFT,
+ mpd->end_pos >> PAGE_SHIFT);
blk_start_plug(&plug);
/*
@@ -2712,8 +2901,14 @@ retry:
* not supported by delalloc.
*/
BUG_ON(ext4_should_journal_data(inode));
- needed_blocks = ext4_da_writepages_trans_blocks(inode);
-
+ /*
+ * Calculate the number of credits to reserve for mapping one
+ * extent of up to MAX_WRITEPAGES_EXTENT_LEN blocks. The writeback
+ * loop will attempt to extend the transaction, or start a new
+ * iteration, if the reserved credits turn out to be insufficient.
+ */
+ needed_blocks = ext4_chunk_trans_blocks(inode,
+ MAX_WRITEPAGES_EXTENT_LEN);
/* start a new transaction */
handle = ext4_journal_start_with_reserve(inode,
EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
@@ -2729,7 +2924,8 @@ retry:
}
mpd->do_map = 1;
- trace_ext4_da_write_pages(inode, mpd->first_page, wbc);
+ trace_ext4_da_write_folios_start(inode, mpd->start_pos,
+ mpd->next_pos, wbc);
ret = mpage_prepare_extent_to_map(mpd);
if (!ret && mpd->map.m_len)
ret = mpage_map_and_submit_extent(handle, mpd,
@@ -2767,6 +2963,8 @@ retry:
} else
ext4_put_io_end(mpd->io_submit.io_end);
mpd->io_submit.io_end = NULL;
+ trace_ext4_da_write_folios_end(inode, mpd->start_pos,
+ mpd->next_pos, wbc, ret);
if (ret == -ENOSPC && sbi->s_journal) {
/*
@@ -2778,6 +2976,8 @@ retry:
ret = 0;
continue;
}
+ if (ret == -EAGAIN)
+ ret = 0;
/* Fatal error - ENOMEM, EIO... */
if (ret)
break;
@@ -2786,8 +2986,8 @@ unplug:
blk_finish_plug(&plug);
if (!ret && !cycled && wbc->nr_to_write > 0) {
cycled = 1;
- mpd->last_page = writeback_index - 1;
- mpd->first_page = 0;
+ mpd->end_pos = (writeback_index << PAGE_SHIFT) - 1;
+ mpd->start_pos = 0;
goto retry;
}
@@ -2797,7 +2997,7 @@ unplug:
* Set the writeback_index so that range_cyclic
* mode will write it back later
*/
- mapping->writeback_index = mpd->first_page;
+ mapping->writeback_index = mpd->start_pos >> PAGE_SHIFT;
out_writepages:
trace_ext4_writepages_result(inode, wbc, ret,
@@ -2817,8 +3017,9 @@ static int ext4_writepages(struct address_space *mapping,
int ret;
int alloc_ctx;
- if (unlikely(ext4_forced_shutdown(sb)))
- return -EIO;
+ ret = ext4_emergency_state(sb);
+ if (unlikely(ret))
+ return ret;
alloc_ctx = ext4_writepages_down_read(sb);
ret = ext4_do_writepages(&mpd);
@@ -2858,8 +3059,9 @@ static int ext4_dax_writepages(struct address_space *mapping,
struct inode *inode = mapping->host;
int alloc_ctx;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ ret = ext4_emergency_state(inode->i_sb);
+ if (unlikely(ret))
+ return ret;
alloc_ctx = ext4_writepages_down_read(inode->i_sb);
trace_ext4_writepages(inode, wbc);
@@ -2906,7 +3108,8 @@ static int ext4_nonda_switch(struct super_block *sb)
return 0;
}
-static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
+static int ext4_da_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
{
@@ -2915,22 +3118,23 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
pgoff_t index;
struct inode *inode = mapping->host;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ ret = ext4_emergency_state(inode->i_sb);
+ if (unlikely(ret))
+ return ret;
index = pos >> PAGE_SHIFT;
if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) {
*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
- return ext4_write_begin(file, mapping, pos,
+ return ext4_write_begin(iocb, mapping, pos,
len, foliop, fsdata);
}
*fsdata = (void *)0;
trace_ext4_da_write_begin(inode, pos, len);
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
- ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len,
- foliop, fsdata);
+ ret = ext4_generic_write_inline_data(mapping, inode, pos, len,
+ foliop, fsdata, true);
if (ret < 0)
return ret;
if (ret == 1)
@@ -2938,18 +3142,20 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
}
retry:
- folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
- mapping_gfp_mask(mapping));
+ folio = write_begin_get_folio(iocb, mapping, index, len);
if (IS_ERR(folio))
return PTR_ERR(folio);
+ if (pos + len > folio_pos(folio) + folio_size(folio))
+ len = folio_pos(folio) + folio_size(folio) - pos;
+
ret = ext4_block_write_begin(NULL, folio, pos, len,
ext4_da_get_block_prep);
if (ret < 0) {
folio_unlock(folio);
folio_put(folio);
/*
- * block_write_begin may have instantiated a few blocks
+ * ext4_block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
* i_size_read because we hold inode lock.
*/
@@ -3008,8 +3214,7 @@ static int ext4_da_do_write_end(struct address_space *mapping,
* block_write_end() will mark the inode as dirty with I_DIRTY_PAGES
* flag, which all that's needed to trigger page writeback.
*/
- copied = block_write_end(NULL, mapping, pos, len, copied,
- folio, NULL);
+ copied = block_write_end(pos, len, copied, folio);
new_i_size = pos + copied;
/*
@@ -3031,7 +3236,7 @@ static int ext4_da_do_write_end(struct address_space *mapping,
unsigned long end;
i_size_write(inode, new_i_size);
- end = (new_i_size - 1) & (PAGE_SIZE - 1);
+ end = offset_in_folio(folio, new_i_size - 1);
if (copied && ext4_da_should_update_i_disksize(folio, end)) {
ext4_update_i_disksize(inode, new_i_size);
disksize_changed = true;
@@ -3060,7 +3265,7 @@ static int ext4_da_do_write_end(struct address_space *mapping,
return copied;
}
-static int ext4_da_write_end(struct file *file,
+static int ext4_da_write_end(const struct kiocb *iocb,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
@@ -3069,7 +3274,7 @@ static int ext4_da_write_end(struct file *file,
int write_mode = (int)(unsigned long)fsdata;
if (write_mode == FALL_BACK_TO_NONDELALLOC)
- return ext4_write_end(file, mapping, pos,
+ return ext4_write_end(iocb, mapping, pos,
len, copied, folio, fsdata);
trace_ext4_da_write_end(inode, pos, len, copied);
@@ -3290,6 +3495,10 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
if (map->m_flags & EXT4_MAP_NEW)
iomap->flags |= IOMAP_F_NEW;
+ /* HW-offload atomics are always used */
+ if (flags & IOMAP_ATOMIC)
+ iomap->flags |= IOMAP_F_ATOMIC_BIO;
+
if (flags & IOMAP_DAX)
iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
else
@@ -3329,12 +3538,149 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
}
}
+static int ext4_map_blocks_atomic_write_slow(handle_t *handle,
+ struct inode *inode, struct ext4_map_blocks *map)
+{
+ ext4_lblk_t m_lblk = map->m_lblk;
+ unsigned int m_len = map->m_len;
+ unsigned int mapped_len = 0, m_flags = 0;
+ ext4_fsblk_t next_pblk;
+ bool check_next_pblk = false;
+ int ret = 0;
+
+ WARN_ON_ONCE(!ext4_has_feature_bigalloc(inode->i_sb));
+
+ /*
+ * This is the slow path for the mixed mapping case. We use the
+ * EXT4_GET_BLOCKS_CREATE_ZERO flag here to make sure any unwritten
+ * or hole regions within the requested range are allocated and
+ * zeroed out, so that we end up with a single contiguous mapped
+ * extent.
+ */
+ m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
+
+ do {
+ ret = ext4_map_blocks(handle, inode, map, m_flags);
+ if (ret < 0 && ret != -ENOSPC)
+ goto out_err;
+ /*
+ * This should never happen, but let's return an error code to
+ * avoid an infinite loop in here.
+ */
+ if (ret == 0) {
+ ret = -EFSCORRUPTED;
+ ext4_warning_inode(inode,
+ "ext4_map_blocks() couldn't allocate blocks m_flags: 0x%x, ret:%d",
+ m_flags, ret);
+ goto out_err;
+ }
+ /*
+ * With bigalloc we should never get ENOSPC nor discontiguous
+ * physical extents.
+ */
+ if ((check_next_pblk && next_pblk != map->m_pblk) ||
+ ret == -ENOSPC) {
+ ext4_warning_inode(inode,
+ "Non-contiguous allocation detected: expected %llu, got %llu, "
+ "or ext4_map_blocks() returned out of space ret: %d",
+ next_pblk, map->m_pblk, ret);
+ ret = -EFSCORRUPTED;
+ goto out_err;
+ }
+ next_pblk = map->m_pblk + map->m_len;
+ check_next_pblk = true;
+
+ mapped_len += map->m_len;
+ map->m_lblk += map->m_len;
+ map->m_len = m_len - mapped_len;
+ } while (mapped_len < m_len);
+
+ /*
+ * We might have done some work in above loop, so we need to query the
+ * start of the physical extent, based on the origin m_lblk and m_len.
+ * Let's also ensure we were able to allocate the required range for
+ * mixed mapping case.
+ */
+ map->m_lblk = m_lblk;
+ map->m_len = m_len;
+ map->m_flags = 0;
+
+ ret = ext4_map_blocks(handle, inode, map,
+ EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF);
+ if (ret != m_len) {
+ ext4_warning_inode(inode,
+ "allocation failed for atomic write request m_lblk:%u, m_len:%u, ret:%d\n",
+ m_lblk, m_len, ret);
+ ret = -EINVAL;
+ }
+ return ret;
+
+out_err:
+ /* reset map before returning an error */
+ map->m_lblk = m_lblk;
+ map->m_len = m_len;
+ map->m_flags = 0;
+ return ret;
+}
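
A simplified, self-contained model of the slow-path loop: repeatedly "allocate" the next chunk, insist each chunk starts at the physical block where the previous one ended, and stop once the whole requested length is mapped. The fake allocator below stands in for ext4_map_blocks() with the zeroing flag; the chunk values are invented.

#include <stdint.h>
#include <stdio.h>

struct chunk { uint64_t pblk; unsigned int len; };

int main(void)
{
	/* Pretend the allocator returns these chunks for one cluster. */
	struct chunk chunks[] = { {500, 4}, {504, 2}, {506, 10} };
	unsigned int want = 16, mapped = 0, i = 0;
	uint64_t next_pblk = 0;
	int contiguous = 1;

	while (mapped < want) {
		struct chunk c = chunks[i++];

		/* With bigalloc the cluster is preallocated, so every
		 * chunk must continue the previous physical extent. */
		if (mapped && c.pblk != next_pblk) {
			contiguous = 0;
			break;
		}
		next_pblk = c.pblk + c.len;
		mapped += c.len;
	}
	printf("mapped=%u contiguous=%d\n", mapped, contiguous);
	return 0;
}
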
+
+/*
+ * ext4_map_blocks_atomic_write: Helper routine to ensure the entire requested
+ * range in @map [lblk, lblk + len) is one single contiguous extent with no
+ * mixed mappings.
+ *
+ * We first use the m_flags passed to us by our caller (ext4_iomap_alloc()).
+ * We only use EXT4_GET_BLOCKS_CREATE_ZERO in the slow path, when the
+ * underlying physical extent for the requested range does not have a single
+ * contiguous mapping type i.e. (Hole, Mapped, or Unwritten) throughout.
+ * In that case we will loop over the requested range to allocate and zero out
+ * the unwritten extents / holes in between, to get a single mapped extent from
+ * [m_lblk, m_lblk + m_len). Note that this is only possible because we know
+ * this can be called only on a bigalloc enabled filesystem where the
+ * underlying cluster is already allocated. This avoids allocating
+ * discontiguous extents in the slow path due to multiple calls to
+ * ext4_map_blocks().
+ * The slow path is not performance critical, so it should be ok to
+ * loop using ext4_map_blocks() with appropriate flags to allocate & zero the
+ * underlying short holes/unwritten extents within the requested range.
+ */
+static int ext4_map_blocks_atomic_write(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int m_flags,
+ bool *force_commit)
+{
+ ext4_lblk_t m_lblk = map->m_lblk;
+ unsigned int m_len = map->m_len;
+ int ret = 0;
+
+ WARN_ON_ONCE(m_len > 1 && !ext4_has_feature_bigalloc(inode->i_sb));
+
+ ret = ext4_map_blocks(handle, inode, map, m_flags);
+ if (ret < 0 || ret == m_len)
+ goto out;
+ /*
+ * This is a mixed mapping case where we were not able to allocate
+ * a single contiguous extent. In that case let's reset requested
+ * mapping and call the slow path.
+ */
+ map->m_lblk = m_lblk;
+ map->m_len = m_len;
+ map->m_flags = 0;
+
+ /*
+ * slow path means we have mixed mapping, that means we will need
+ * to force txn commit.
+ */
+ *force_commit = true;
+ return ext4_map_blocks_atomic_write_slow(handle, inode, map);
+out:
+ return ret;
+}
+
static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
unsigned int flags)
{
handle_t *handle;
u8 blkbits = inode->i_blkbits;
int ret, dio_credits, m_flags = 0, retries = 0;
+ bool force_commit = false;
/*
* Trim the mapping request to the maximum value that we can map at
@@ -3342,7 +3688,30 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
*/
if (map->m_len > DIO_MAX_BLOCKS)
map->m_len = DIO_MAX_BLOCKS;
- dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
+
+ /*
+ * Journal credits estimation for atomic writes. We call
+ * ext4_map_blocks() to find out if there could be a mixed mapping. If
+ * so, assume the number of physical extents required can be up to
+ * m_len, i.e. every alternate block could be an unwritten extent or a
+ * hole.
+ */
+ if (flags & IOMAP_ATOMIC) {
+ unsigned int orig_mlen = map->m_len;
+
+ ret = ext4_map_blocks(NULL, inode, map, 0);
+ if (ret < 0)
+ return ret;
+ if (map->m_len < orig_mlen) {
+ map->m_len = orig_mlen;
+ dio_credits = ext4_meta_trans_blocks(inode, orig_mlen,
+ map->m_len);
+ } else {
+ dio_credits = ext4_chunk_trans_blocks(inode,
+ map->m_len);
+ }
+ } else {
+ dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
+ }
retry:
/*
@@ -3373,7 +3742,11 @@ retry:
else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
- ret = ext4_map_blocks(handle, inode, map, m_flags);
+ if (flags & IOMAP_ATOMIC)
+ ret = ext4_map_blocks_atomic_write(handle, inode, map, m_flags,
+ &force_commit);
+ else
+ ret = ext4_map_blocks(handle, inode, map, m_flags);
/*
* We cannot fill holes in indirect tree based inodes as that could
@@ -3387,6 +3760,22 @@ retry:
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
+ /*
+ * Force commit the current transaction if the allocation spans a mixed
+ * mapping range. This ensures any pending metadata updates (like
+ * unwritten to written extents conversion) in this range are in
+ * consistent state with the file data blocks, before performing the
+ * actual write I/O. If the commit fails, the whole I/O must be aborted
+ * to prevent any possible torn writes.
+ */
+ if (ret > 0 && force_commit) {
+ int ret2;
+
+ ret2 = ext4_force_commit(inode->i_sb);
+ if (ret2)
+ return ret2;
+ }
+
return ret;
}
@@ -3397,6 +3786,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
int ret;
struct ext4_map_blocks map;
u8 blkbits = inode->i_blkbits;
+ unsigned int orig_mlen;
if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
return -EINVAL;
@@ -3410,6 +3800,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
map.m_lblk = offset >> blkbits;
map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+ orig_mlen = map.m_len;
if (flags & IOMAP_WRITE) {
/*
@@ -3420,11 +3811,23 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
*/
if (offset + length <= i_size_read(inode)) {
ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED))
- goto out;
+ /*
+ * For atomic writes the entire requested length should
+ * be mapped.
+ */
+ if (map.m_flags & EXT4_MAP_MAPPED) {
+ if ((!(flags & IOMAP_ATOMIC) && ret > 0) ||
+ (flags & IOMAP_ATOMIC && ret >= orig_mlen))
+ goto out;
+ }
+ map.m_len = orig_mlen;
}
ret = ext4_iomap_alloc(inode, &map, flags);
} else {
+ /*
+ * This can be called from the overwrite path via
+ * ext4_iomap_overwrite_begin().
+ */
ret = ext4_map_blocks(NULL, inode, &map, 0);
}
@@ -3438,6 +3841,16 @@ out:
*/
map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);
+ /*
+ * Before returning to iomap, let's ensure the allocated mapping
+ * covers the entire requested length for atomic writes.
+ */
+ if (flags & IOMAP_ATOMIC) {
+ if (map.m_len < (length >> blkbits)) {
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+ }
ext4_set_iomap(inode, iomap, &map, offset, length, flags);
return 0;
@@ -3679,9 +4092,7 @@ void ext4_set_aops(struct inode *inode)
static int __ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
{
- ext4_fsblk_t index = from >> PAGE_SHIFT;
- unsigned offset = from & (PAGE_SIZE-1);
- unsigned blocksize, pos;
+ unsigned int offset, blocksize, pos;
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
struct buffer_head *bh;
@@ -3696,13 +4107,14 @@ static int __ext4_block_zero_page_range(handle_t *handle,
blocksize = inode->i_sb->s_blocksize;
- iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
+ iblock = folio->index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
bh = folio_buffers(folio);
if (!bh)
bh = create_empty_buffers(folio, blocksize, 0);
/* Find the buffer that contains "offset" */
+ offset = offset_in_folio(folio, from);
pos = blocksize;
while (offset >= pos) {
bh = bh->b_this_page;
@@ -3902,6 +4314,68 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
return ret;
}
+static inline void ext4_truncate_folio(struct inode *inode,
+ loff_t start, loff_t end)
+{
+ unsigned long blocksize = i_blocksize(inode);
+ struct folio *folio;
+
+ /* Nothing to be done if no complete block needs to be truncated. */
+ if (round_up(start, blocksize) >= round_down(end, blocksize))
+ return;
+
+ folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT);
+ if (IS_ERR(folio))
+ return;
+
+ if (folio_mkclean(folio))
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+}
+
+int ext4_truncate_page_cache_block_range(struct inode *inode,
+ loff_t start, loff_t end)
+{
+ unsigned long blocksize = i_blocksize(inode);
+ int ret;
+
+ /*
+ * For journalled data we need to write (and checkpoint) pages
+ * before discarding page cache to avoid inconsistent data on disk
+ * in case of a crash before the transaction freeing blocks or
+ * converting unwritten extents is committed.
+ */
+ if (ext4_should_journal_data(inode)) {
+ ret = filemap_write_and_wait_range(inode->i_mapping, start,
+ end - 1);
+ if (ret)
+ return ret;
+ goto truncate_pagecache;
+ }
+
+ /*
+ * If the block size is less than the page size, the file's mapped
+ * blocks within one page could be freed or converted to unwritten.
+ * So it's necessary to remove writable userspace mappings, so that
+ * ext4_page_mkwrite() is called on subsequent write access to these
+ * partial folios.
+ */
+ if (!IS_ALIGNED(start | end, PAGE_SIZE) &&
+ blocksize < PAGE_SIZE && start < inode->i_size) {
+ loff_t page_boundary = round_up(start, PAGE_SIZE);
+
+ ext4_truncate_folio(inode, start, min(page_boundary, end));
+ if (end > page_boundary)
+ ext4_truncate_folio(inode,
+ round_down(end, PAGE_SIZE), end);
+ }
+
+truncate_pagecache:
+ truncate_pagecache_range(inode, start, end - 1);
+ return 0;
+}
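Worked example for the partial-folio branch, assuming 1KiB blocks and 4KiB pages: for start = 5120 and end = 17408, page_boundary = 8192, so ext4_truncate_folio() is called for [5120, 8192) and again for [16384, 17408), while the fully covered folios in between are simply dropped by truncate_pagecache_range(). When blocksize == PAGE_SIZE the branch is skipped entirely.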
+
static void ext4_wait_dax_page(struct inode *inode)
{
filemap_invalidate_unlock(inode->i_mapping);
@@ -3911,24 +4385,10 @@ static void ext4_wait_dax_page(struct inode *inode)
int ext4_break_layouts(struct inode *inode)
{
- struct page *page;
- int error;
-
if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)))
return -EINVAL;
- do {
- page = dax_layout_busy_page(inode->i_mapping);
- if (!page)
- return 0;
-
- error = ___wait_var_event(&page->_refcount,
- atomic_read(&page->_refcount) == 1,
- TASK_INTERRUPTIBLE, 0, 0,
- ext4_wait_dax_page(inode));
- } while (error == 0);
-
- return error;
+ return dax_break_layout_inode(inode, ext4_wait_dax_page);
}
/*
@@ -3946,148 +4406,112 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- ext4_lblk_t first_block, stop_block;
- struct address_space *mapping = inode->i_mapping;
- loff_t first_block_offset, last_block_offset, max_length;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ ext4_lblk_t start_lblk, end_lblk;
+ loff_t max_end = sb->s_maxbytes;
+ loff_t end = offset + length;
handle_t *handle;
unsigned int credits;
- int ret = 0, ret2 = 0;
+ int ret;
trace_ext4_punch_hole(inode, offset, length, 0);
+ WARN_ON_ONCE(!inode_is_locked(inode));
/*
- * Write out all dirty pages to avoid race conditions
- * Then release them.
+ * For indirect-block based inodes, make sure the hole ends at least
+ * one block before the maximum supported file offset.
*/
- if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
- ret = filemap_write_and_wait_range(mapping, offset,
- offset + length - 1);
- if (ret)
- return ret;
- }
-
- inode_lock(inode);
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize;
/* No need to punch hole beyond i_size */
- if (offset >= inode->i_size)
- goto out_mutex;
+ if (offset >= inode->i_size || offset >= max_end)
+ return 0;
/*
- * If the hole extends beyond i_size, set the hole
- * to end after the page that contains i_size
+ * If the hole extends beyond i_size, set the hole to end after
+ * the page that contains i_size.
*/
- if (offset + length > inode->i_size) {
- length = inode->i_size +
- PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) -
- offset;
- }
+ if (end > inode->i_size)
+ end = round_up(inode->i_size, PAGE_SIZE);
+ if (end > max_end)
+ end = max_end;
+ length = end - offset;
/*
- * For punch hole the length + offset needs to be within one block
- * before last range. Adjust the length if it goes beyond that limit.
+ * Attach jinode to inode for jbd2 if we do any zeroing of partial
+ * block.
*/
- max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
- if (offset + length > max_length)
- length = max_length - offset;
-
- if (offset & (sb->s_blocksize - 1) ||
- (offset + length) & (sb->s_blocksize - 1)) {
- /*
- * Attach jinode to inode for jbd2 if we do any zeroing of
- * partial block
- */
+ if (!IS_ALIGNED(offset | end, sb->s_blocksize)) {
ret = ext4_inode_attach_jinode(inode);
if (ret < 0)
- goto out_mutex;
-
+ return ret;
}
- /* Wait all existing dio workers, newcomers will block on i_rwsem */
- inode_dio_wait(inode);
- ret = file_modified(file);
+ ret = ext4_update_disksize_before_punch(inode, offset, length);
if (ret)
- goto out_mutex;
-
- /*
- * Prevent page faults from reinstantiating pages we have released from
- * page cache.
- */
- filemap_invalidate_lock(mapping);
-
- ret = ext4_break_layouts(inode);
- if (ret)
- goto out_dio;
-
- first_block_offset = round_up(offset, sb->s_blocksize);
- last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
+ return ret;
/* Now release the pages and zero the block-aligned part of pages */
- if (last_block_offset > first_block_offset) {
- ret = ext4_update_disksize_before_punch(inode, offset, length);
- if (ret)
- goto out_dio;
- truncate_pagecache_range(inode, first_block_offset,
- last_block_offset);
- }
+ ret = ext4_truncate_page_cache_block_range(inode, offset, end);
+ if (ret)
+ return ret;
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- credits = ext4_writepage_trans_blocks(inode);
+ credits = ext4_chunk_trans_extent(inode, 2);
else
credits = ext4_blocks_for_truncate(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_std_error(sb, ret);
- goto out_dio;
+ return ret;
}
- ret = ext4_zero_partial_blocks(handle, inode, offset,
- length);
+ ret = ext4_zero_partial_blocks(handle, inode, offset, length);
if (ret)
- goto out_stop;
-
- first_block = (offset + sb->s_blocksize - 1) >>
- EXT4_BLOCK_SIZE_BITS(sb);
- stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
+ goto out_handle;
/* If there are blocks to remove, do it */
- if (stop_block > first_block) {
- ext4_lblk_t hole_len = stop_block - first_block;
+ start_lblk = EXT4_B_TO_LBLK(inode, offset);
+ end_lblk = end >> inode->i_blkbits;
+
+ if (end_lblk > start_lblk) {
+ ext4_lblk_t hole_len = end_lblk - start_lblk;
+ ext4_fc_track_inode(handle, inode);
+ ext4_check_map_extents_env(inode);
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
- ext4_es_remove_extent(inode, first_block, hole_len);
+ ext4_es_remove_extent(inode, start_lblk, hole_len);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- ret = ext4_ext_remove_space(inode, first_block,
- stop_block - 1);
+ ret = ext4_ext_remove_space(inode, start_lblk,
+ end_lblk - 1);
else
- ret = ext4_ind_remove_space(handle, inode, first_block,
- stop_block);
+ ret = ext4_ind_remove_space(handle, inode, start_lblk,
+ end_lblk);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_handle;
+ }
- ext4_es_insert_extent(inode, first_block, hole_len, ~0,
+ ext4_es_insert_extent(inode, start_lblk, hole_len, ~0,
EXTENT_STATUS_HOLE, 0);
up_write(&EXT4_I(inode)->i_data_sem);
}
- ext4_fc_track_range(handle, inode, first_block, stop_block);
+ ext4_fc_track_range(handle, inode, start_lblk, end_lblk);
+
+ ret = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(ret))
+ goto out_handle;
+
+ ext4_update_inode_fsync_trans(handle, inode, 1);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
-
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
- ret2 = ext4_mark_inode_dirty(handle, inode);
- if (unlikely(ret2))
- ret = ret2;
- if (ret >= 0)
- ext4_update_inode_fsync_trans(handle, inode, 1);
-out_stop:
+out_handle:
ext4_journal_stop(handle);
-out_dio:
- filemap_invalidate_unlock(mapping);
-out_mutex:
- inode_unlock(inode);
return ret;
}
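Worked example of the block math, with 4KiB blocks: punching offset = 1000, length = 5000 yields end = 6000, start_lblk = 1 (rounded up) and end_lblk = 1 (rounded down), so no whole block is removed and only ext4_zero_partial_blocks() does any work. Punching offset = 0, length = 8192 yields start_lblk = 0 and end_lblk = 2: exactly two blocks are freed and nothing needs zeroing.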
@@ -4183,7 +4607,7 @@ int ext4_truncate(struct inode *inode)
}
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- credits = ext4_writepage_trans_blocks(inode);
+ credits = ext4_chunk_trans_extent(inode, 1);
else
credits = ext4_blocks_for_truncate(inode);
@@ -4209,8 +4633,10 @@ int ext4_truncate(struct inode *inode)
if (err)
goto out_stop;
- down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_fc_track_inode(handle, inode);
+ ext4_check_map_extents_env(inode);
+ down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -4674,6 +5100,11 @@ static inline int ext4_iget_extra_inode(struct inode *inode,
*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
int err;
+ err = xattr_check_inode(inode, IHDR(inode, raw_inode),
+ ITAIL(inode, raw_inode));
+ if (err)
+ return err;
+
ext4_set_inode_state(inode, EXT4_STATE_XATTR);
err = ext4_find_inline_data_nolock(inode);
if (!err && ext4_has_inline_data(inode))
@@ -4705,22 +5136,76 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val)
inode_set_iversion_queried(inode, val);
}
-static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags)
-
+static int check_igot_inode(struct inode *inode, ext4_iget_flags flags,
+ const char *function, unsigned int line)
{
+ const char *err_str;
+
if (flags & EXT4_IGET_EA_INODE) {
- if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
- return "missing EA_INODE flag";
+ if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
+ err_str = "missing EA_INODE flag";
+ goto error;
+ }
if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
- EXT4_I(inode)->i_file_acl)
- return "ea_inode with extended attributes";
+ EXT4_I(inode)->i_file_acl) {
+ err_str = "ea_inode with extended attributes";
+ goto error;
+ }
} else {
- if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
- return "unexpected EA_INODE flag";
+ if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
+ /*
+ * open_by_handle_at() could provide an old inode number
+ * that has since been reused for an ea_inode; this does
+ * not indicate filesystem corruption
+ */
+ if (flags & EXT4_IGET_HANDLE)
+ return -ESTALE;
+ err_str = "unexpected EA_INODE flag";
+ goto error;
+ }
+ }
+ if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) {
+ err_str = "unexpected bad inode w/o EXT4_IGET_BAD";
+ goto error;
}
- if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD))
- return "unexpected bad inode w/o EXT4_IGET_BAD";
- return NULL;
+ return 0;
+
+error:
+ ext4_error_inode(inode, function, line, 0, "%s", err_str);
+ return -EFSCORRUPTED;
+}
+
+static bool ext4_should_enable_large_folio(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ if (!S_ISREG(inode->i_mode))
+ return false;
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
+ ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
+ return false;
+ if (ext4_has_feature_verity(sb))
+ return false;
+ if (ext4_has_feature_encrypt(sb))
+ return false;
+
+ return true;
+}
+
+/*
+ * Limit the maximum folio order to 2048 blocks to prevent overestimation
+ * of reserved journal handle credits during folio writeback in
+ * environments where PAGE_SIZE exceeds 4KB.
+ */
+#define EXT4_MAX_PAGECACHE_ORDER(i) \
+ umin(MAX_PAGECACHE_ORDER, (11 + (i)->i_blkbits - PAGE_SHIFT))
+void ext4_set_inode_mapping_order(struct inode *inode)
+{
+ if (!ext4_should_enable_large_folio(inode))
+ return;
+
+ mapping_set_folio_order_range(inode->i_mapping, 0,
+ EXT4_MAX_PAGECACHE_ORDER(inode));
}
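Worked example of the cap: with 4KiB blocks and 4KiB pages the limit is 11 + 12 - 12 = 11, i.e. order-11 folios (2^11 pages = 8MiB = 2048 blocks); with 64KiB pages it is 11 + 12 - 16 = 7, i.e. order-7 folios, which is again 8MiB, so the writeback credit estimate is bounded by the same 2048 blocks either way.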
struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
@@ -4732,7 +5217,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
struct ext4_inode_info *ei;
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
struct inode *inode;
- const char *err_str;
journal_t *journal = EXT4_SB(sb)->s_journal;
long ret;
loff_t size;
@@ -4741,12 +5225,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
gid_t i_gid;
projid_t i_projid;
- if ((!(flags & EXT4_IGET_SPECIAL) &&
- ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) ||
- ino == le32_to_cpu(es->s_usr_quota_inum) ||
- ino == le32_to_cpu(es->s_grp_quota_inum) ||
- ino == le32_to_cpu(es->s_prj_quota_inum) ||
- ino == le32_to_cpu(es->s_orphan_file_inum))) ||
+ if ((!(flags & EXT4_IGET_SPECIAL) && is_special_ino(sb, ino)) ||
(ino < EXT4_ROOT_INO) ||
(ino > le32_to_cpu(es->s_inodes_count))) {
if (flags & EXT4_IGET_HANDLE)
@@ -4761,10 +5240,10 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
if (!inode)
return ERR_PTR(-ENOMEM);
if (!(inode->i_state & I_NEW)) {
- if ((err_str = check_igot_inode(inode, flags)) != NULL) {
- ext4_error_inode(inode, function, line, 0, err_str);
+ ret = check_igot_inode(inode, flags, function, line);
+ if (ret) {
iput(inode);
- return ERR_PTR(-EFSCORRUPTED);
+ return ERR_PTR(ret);
}
return inode;
}
@@ -4800,15 +5279,14 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ei->i_extra_isize = 0;
/* Precompute checksum seed for inode metadata */
- if (ext4_has_metadata_csum(sb)) {
+ if (ext4_has_feature_metadata_csum(sb)) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 csum;
__le32 inum = cpu_to_le32(inode->i_ino);
__le32 gen = raw_inode->i_generation;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum,
sizeof(inum));
- ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
- sizeof(gen));
+ ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen));
}
if ((!ext4_inode_csum_verify(inode, raw_inode, ei) ||
@@ -4876,7 +5354,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ei->i_file_acl |=
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
inode->i_size = ext4_isize(sb, raw_inode);
- if ((size = i_size_read(inode)) < 0) {
+ size = i_size_read(inode);
+ if (size < 0 || size > ext4_get_maxbytes(inode)) {
ext4_error_inode(inode, function, line, 0,
"iget: bad i_size value: %lld", size);
ret = -EFSCORRUPTED;
@@ -4887,7 +5366,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
* we'd normally treat htree data as empty space. But with metadata
* checksumming that corrupts checksums so forbid that.
*/
- if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) &&
+ if (!ext4_has_feature_dir_index(sb) &&
+ ext4_has_feature_metadata_csum(sb) &&
ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) {
ext4_error_inode(inode, function, line, 0,
"iget: Dir with htree data on filesystem without dir_index feature.");
@@ -5007,8 +5487,16 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
inode->i_op = &ext4_encrypted_symlink_inode_operations;
} else if (ext4_inode_is_fast_symlink(inode)) {
inode->i_op = &ext4_fast_symlink_inode_operations;
- nd_terminate_link(ei->i_data, inode->i_size,
- sizeof(ei->i_data) - 1);
+ if (inode->i_size == 0 ||
+ inode->i_size >= sizeof(ei->i_data) ||
+ strnlen((char *)ei->i_data, inode->i_size + 1) !=
+ inode->i_size) {
+ ext4_error_inode(inode, function, line, 0,
+ "invalid fast symlink length %llu",
+ (unsigned long long)inode->i_size);
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
inode_set_cached_link(inode, (char *)ei->i_data,
inode->i_size);
} else {
@@ -5037,13 +5525,24 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ret = -EFSCORRUPTED;
goto bad_inode;
}
- if ((err_str = check_igot_inode(inode, flags)) != NULL) {
- ext4_error_inode(inode, function, line, 0, err_str);
- ret = -EFSCORRUPTED;
- goto bad_inode;
- }
+ ext4_set_inode_mapping_order(inode);
+
+ ret = check_igot_inode(inode, flags, function, line);
+ /*
+ * -ESTALE here means there is nothing inherently wrong with the inode,
+ * it's just not an inode we can return for an fhandle lookup.
+ */
+ if (ret == -ESTALE) {
+ brelse(iloc.bh);
+ unlock_new_inode(inode);
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
+ if (ret)
+ goto bad_inode;
brelse(iloc.bh);
+
unlock_new_inode(inode);
return inode;
@@ -5228,8 +5727,9 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
return 0;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ err = ext4_emergency_state(inode->i_sb);
+ if (unlikely(err))
+ return err;
if (EXT4_SB(inode->i_sb)->s_journal) {
if (ext4_journal_current_handle()) {
@@ -5351,8 +5851,9 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
const unsigned int ia_valid = attr->ia_valid;
bool inc_ivers = true;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ error = ext4_emergency_state(inode->i_sb);
+ if (unlikely(error))
+ return error;
if (unlikely(IS_IMMUTABLE(inode)))
return -EPERM;
@@ -5464,7 +5965,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
oldsize & (inode->i_sb->s_blocksize - 1)) {
error = ext4_inode_attach_jinode(inode);
if (error)
- goto err_out;
+ goto out_mmap_sem;
}
handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
@@ -5505,9 +6006,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
down_write(&EXT4_I(inode)->i_data_sem);
old_disksize = EXT4_I(inode)->i_disksize;
EXT4_I(inode)->i_disksize = attr->ia_size;
- rc = ext4_mark_inode_dirty(handle, inode);
- if (!error)
- error = rc;
+
/*
* We have to update i_size under i_data_sem together
* with i_disksize to avoid races with writeback code
@@ -5518,6 +6017,9 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
else
EXT4_I(inode)->i_disksize = old_disksize;
up_write(&EXT4_I(inode)->i_data_sem);
+ rc = ext4_mark_inode_dirty(handle, inode);
+ if (!error)
+ error = rc;
ext4_journal_stop(handle);
if (error)
goto out_mmap_sem;
@@ -5633,7 +6135,7 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path,
awu_max = sbi->s_awu_max;
}
- generic_fill_statx_atomic_writes(stat, awu_min, awu_max);
+ generic_fill_statx_atomic_writes(stat, awu_min, awu_max, 0);
}
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
@@ -5714,8 +6216,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
*
* Also account for superblock, inode, quota and xattr blocks
*/
-static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
- int pextents)
+int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents)
{
ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
int gdpblocks;
@@ -5723,13 +6224,11 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
int ret;
/*
- * How many index blocks need to touch to map @lblocks logical blocks
- * to @pextents physical extents?
+ * How many index and leaf blocks do we need to touch to map @lblocks
+ * logical blocks to @pextents physical extents?
*/
idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
- ret = idxblocks;
-
/*
* Now let's see how many group bitmaps and group descriptors need
* to account
@@ -5742,7 +6241,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
/* bitmaps and block group descriptor blocks */
- ret += groups + gdpblocks;
+ ret = idxblocks + groups + gdpblocks;
/* Blocks for super block, inode, quota and xattr blocks */
ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
@@ -5751,25 +6250,19 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
}
/*
- * Calculate the total number of credits to reserve to fit
- * the modification of a single pages into a single transaction,
- * which may include multiple chunks of block allocations.
- *
- * This could be called via ext4_write_begin()
- *
- * We need to consider the worse case, when
- * one new block per extent.
+ * Calculate the journal credits for modifying a number of blocks
+ * within a single extent in one transaction. 'nrblocks' is used only
+ * for non-extent inodes. For extent type inodes, 'nrblocks' can be
+ * zero if the exact number of blocks is unknown.
*/
-int ext4_writepage_trans_blocks(struct inode *inode)
+int ext4_chunk_trans_extent(struct inode *inode, int nrblocks)
{
- int bpp = ext4_journal_blocks_per_page(inode);
int ret;
- ret = ext4_meta_trans_blocks(inode, bpp, bpp);
-
+ ret = ext4_meta_trans_blocks(inode, nrblocks, 1);
/* Account for data blocks for journalled mode */
if (ext4_should_journal_data(inode))
- ret += bpp;
+ ret += nrblocks;
return ret;
}
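For example, the punch-hole path above requests ext4_chunk_trans_extent(inode, 2): the metadata credits for touching a single extent, via ext4_meta_trans_blocks(inode, 2, 1), plus two extra credits in data=journal mode for the partially zeroed data blocks themselves.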
@@ -5796,9 +6289,10 @@ int ext4_mark_iloc_dirty(handle_t *handle,
{
int err = 0;
- if (unlikely(ext4_forced_shutdown(inode->i_sb))) {
+ err = ext4_emergency_state(inode->i_sb);
+ if (unlikely(err)) {
put_bh(iloc->bh);
- return -EIO;
+ return err;
}
ext4_fc_track_inode(handle, inode);
@@ -5822,8 +6316,9 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
{
int err;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ err = ext4_emergency_state(inode->i_sb);
+ if (unlikely(err))
+ return err;
err = ext4_get_inode_loc(inode, iloc);
if (!err) {
@@ -5834,6 +6329,7 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
brelse(iloc->bh);
iloc->bh = NULL;
}
+ ext4_fc_track_inode(handle, inode);
}
ext4_std_error(inode->i_sb, err);
return err;
@@ -6138,6 +6634,55 @@ static int ext4_bh_unmapped(handle_t *handle, struct inode *inode,
return !buffer_mapped(bh);
}
+static int ext4_block_page_mkwrite(struct inode *inode, struct folio *folio,
+ get_block_t get_block)
+{
+ handle_t *handle;
+ loff_t size;
+ unsigned long len;
+ int credits;
+ int ret;
+
+ credits = ext4_chunk_trans_extent(inode,
+ ext4_journal_blocks_per_folio(inode));
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ folio_lock(folio);
+ size = i_size_read(inode);
+ /* Page got truncated from under us? */
+ if (folio->mapping != inode->i_mapping || folio_pos(folio) > size) {
+ ret = -EFAULT;
+ goto out_error;
+ }
+
+ len = folio_size(folio);
+ if (folio_pos(folio) + len > size)
+ len = size - folio_pos(folio);
+
+ ret = ext4_block_write_begin(handle, folio, 0, len, get_block);
+ if (ret)
+ goto out_error;
+
+ if (!ext4_should_journal_data(inode)) {
+ block_commit_write(folio, 0, len);
+ folio_mark_dirty(folio);
+ } else {
+ ret = ext4_journal_folio_buffers(handle, folio, len);
+ if (ret)
+ goto out_error;
+ }
+ ext4_journal_stop(handle);
+ folio_wait_stable(folio);
+ return ret;
+
+out_error:
+ folio_unlock(folio);
+ ext4_journal_stop(handle);
+ return ret;
+}
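Note the locking contract: on success the folio is left locked, as the ->page_mkwrite protocol expects, while every error path unlocks it before returning. The caller, ext4_page_mkwrite() below, retries on -EAGAIN and, for as long as ext4_should_retry_alloc() allows, on -ENOSPC.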
+
vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
@@ -6149,8 +6694,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
struct file *file = vma->vm_file;
struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
- handle_t *handle;
- get_block_t *get_block;
+ get_block_t *get_block = ext4_get_block;
int retries = 0;
if (unlikely(IS_IMMUTABLE(inode)))
@@ -6218,47 +6762,11 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
/* OK, we need to fill the hole... */
if (ext4_should_dioread_nolock(inode))
get_block = ext4_get_block_unwritten;
- else
- get_block = ext4_get_block;
retry_alloc:
- handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
- ext4_writepage_trans_blocks(inode));
- if (IS_ERR(handle)) {
- ret = VM_FAULT_SIGBUS;
- goto out;
- }
- /*
- * Data journalling can't use block_page_mkwrite() because it
- * will set_buffer_dirty() before do_journal_get_write_access()
- * thus might hit warning messages for dirty metadata buffers.
- */
- if (!ext4_should_journal_data(inode)) {
- err = block_page_mkwrite(vma, vmf, get_block);
- } else {
- folio_lock(folio);
- size = i_size_read(inode);
- /* Page got truncated from under us? */
- if (folio->mapping != mapping || folio_pos(folio) > size) {
- ret = VM_FAULT_NOPAGE;
- goto out_error;
- }
-
- len = folio_size(folio);
- if (folio_pos(folio) + len > size)
- len = size - folio_pos(folio);
-
- err = ext4_block_write_begin(handle, folio, 0, len,
- ext4_get_block);
- if (!err) {
- ret = VM_FAULT_SIGBUS;
- if (ext4_journal_folio_buffers(handle, folio, len))
- goto out_error;
- } else {
- folio_unlock(folio);
- }
- }
- ext4_journal_stop(handle);
- if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ /* Start journal and allocate blocks */
+ err = ext4_block_page_mkwrite(inode, folio, get_block);
+ if (err == -EAGAIN ||
+ (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)))
goto retry_alloc;
out_ret:
ret = vmf_fs_error(err);
@@ -6266,8 +6774,4 @@ out:
filemap_invalidate_unlock_shared(mapping);
sb_end_pagefault(inode->i_sb);
return ret;
-out_error:
- folio_unlock(folio);
- ext4_journal_stop(handle);
- goto out;
}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7b9ce71c1c81..84e3c73952d7 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -142,16 +142,16 @@ static int ext4_update_backup_sb(struct super_block *sb,
es = (struct ext4_super_block *) (bh->b_data + offset);
lock_buffer(bh);
- if (ext4_has_metadata_csum(sb) &&
- es->s_checksum != ext4_superblock_csum(sb, es)) {
+ if (ext4_has_feature_metadata_csum(sb) &&
+ es->s_checksum != ext4_superblock_csum(es)) {
ext4_msg(sb, KERN_ERR, "Invalid checksum for backup "
"superblock %llu", sb_block);
unlock_buffer(bh);
goto out_bh;
}
func(es, arg);
- if (ext4_has_metadata_csum(sb))
- es->s_checksum = ext4_superblock_csum(sb, es);
+ if (ext4_has_feature_metadata_csum(sb))
+ es->s_checksum = ext4_superblock_csum(es);
set_buffer_uptodate(bh);
unlock_buffer(bh);
@@ -351,11 +351,11 @@ void ext4_reset_inode_seed(struct inode *inode)
__le32 gen = cpu_to_le32(inode->i_generation);
__u32 csum;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum));
- ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen));
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum));
+ ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen));
}
/*
@@ -980,7 +980,7 @@ group_add_out:
return err;
}
-int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int ext4_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -997,7 +997,7 @@ int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa)
}
int ext4_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
u32 flags = fa->flags;
@@ -1205,7 +1205,8 @@ static int ext4_ioctl_setuuid(struct file *filp,
* If any checksums (group descriptors or metadata) are being used
* then the checksum seed feature is required to change the UUID.
*/
- if (((ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb))
+ if (((ext4_has_feature_gdt_csum(sb) ||
+ ext4_has_feature_metadata_csum(sb))
&& !ext4_has_feature_csum_seed(sb))
|| ext4_has_feature_stable_inodes(sb))
return -EOPNOTSUPP;
@@ -1253,7 +1254,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (!inode_owner_or_capable(idmap, inode))
return -EPERM;
- if (ext4_has_metadata_csum(inode->i_sb)) {
+ if (ext4_has_feature_metadata_csum(inode->i_sb)) {
ext4_warning(sb, "Setting inode version is not "
"supported with metadata_csum enabled.");
return -ENOTTY;
@@ -1504,8 +1505,14 @@ resizefs_out:
return 0;
}
case EXT4_IOC_PRECACHE_EXTENTS:
- return ext4_ext_precache(inode);
+ {
+ int ret;
+ inode_lock_shared(inode);
+ ret = ext4_ext_precache(inode);
+ inode_unlock_shared(inode);
+ return ret;
+ }
case FS_IOC_SET_ENCRYPTION_POLICY:
if (!ext4_has_feature_encrypt(sb))
return -EOPNOTSUPP;
@@ -1705,7 +1712,7 @@ int ext4_update_overhead(struct super_block *sb, bool force)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (sb_rdonly(sb))
+ if (ext4_emergency_state(sb) || sb_rdonly(sb))
return 0;
if (!force &&
(sbi->s_overhead == 0 ||
diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c
index bb2a223b207c..a9416b20ff64 100644
--- a/fs/ext4/mballoc-test.c
+++ b/fs/ext4/mballoc-test.c
@@ -155,6 +155,7 @@ static struct super_block *mbt_ext4_alloc_super_block(void)
bgl_lock_init(sbi->s_blockgroup_lock);
sbi->s_es = &fsb->es;
+ sbi->s_sb = sb;
sb->s_fs_info = sbi;
up_write(&sb->s_umount);
@@ -796,11 +797,14 @@ static void test_mb_mark_used(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy);
grp = kunit_kzalloc(test, offsetof(struct ext4_group_info,
bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp);
ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
KUNIT_ASSERT_EQ(test, ret, 0);
grp->bb_free = EXT4_CLUSTERS_PER_GROUP(sb);
+ grp->bb_largest_free_order = -1;
+ grp->bb_avg_fragment_size_order = -1;
mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
for (i = 0; i < TEST_RANGE_COUNT; i++)
test_mb_mark_used_range(test, &e4b, ranges[i].start,
@@ -860,6 +864,7 @@ static void test_mb_free_blocks(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy);
grp = kunit_kzalloc(test, offsetof(struct ext4_group_info,
bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp);
ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
KUNIT_ASSERT_EQ(test, ret, 0);
@@ -873,6 +878,8 @@ static void test_mb_free_blocks(struct kunit *test)
ext4_unlock_group(sb, TEST_GOAL_GROUP);
grp->bb_free = 0;
+ grp->bb_largest_free_order = -1;
+ grp->bb_avg_fragment_size_order = -1;
memset(bitmap, 0xff, sb->s_blocksize);
mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b25a27c86696..5898d92ba19f 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -132,25 +132,30 @@
* If "mb_optimize_scan" mount option is set, we maintain in memory group info
* structures in two data structures:
*
- * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders)
+ * 1) Array of largest free order xarrays (sbi->s_mb_largest_free_orders)
*
- * Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks)
+ * Locking: Writers use xa_lock, readers use rcu_read_lock.
*
- * This is an array of lists where the index in the array represents the
+ * This is an array of xarrays where the index in the array represents the
* largest free order in the buddy bitmap of the participating group infos of
- * that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total
- * number of buddy bitmap orders possible) number of lists. Group-infos are
- * placed in appropriate lists.
+ * that xarray. So there are exactly MB_NUM_ORDERS(sb) xarrays (the total
+ * number of possible buddy bitmap orders). Group-infos are placed in the
+ * appropriate xarrays.
*
- * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size)
+ * 2) Average fragment size xarrays (sbi->s_mb_avg_fragment_size)
*
- * Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks)
+ * Locking: Writers use xa_lock, readers use rcu_read_lock.
*
- * This is an array of lists where in the i-th list there are groups with
+ * This is an array of xarrays where the i-th xarray contains groups with
* average fragment size >= 2^i and < 2^(i+1). The average fragment size
* is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments.
- * Note that we don't bother with a special list for completely empty groups
- * so we only have MB_NUM_ORDERS(sb) lists.
+ * Note that we don't bother with a special xarray for completely empty
+ * groups, so we only have MB_NUM_ORDERS(sb) xarrays. Group-infos are placed
+ * in appropriate xarrays.
+ *
+ * Within each xarray, the index is the block group number and the value is
+ * the block group information; a non-empty entry indicates that the block
+ * group is present in that xarray.
*
* When "mb_optimize_scan" mount option is set, mballoc consults the above data
* structures to decide the order in which groups are to be traversed for
@@ -187,7 +192,7 @@
* /sys/fs/ext4/<partition>/mb_min_to_scan
* /sys/fs/ext4/<partition>/mb_max_to_scan
* /sys/fs/ext4/<partition>/mb_order2_req
- * /sys/fs/ext4/<partition>/mb_linear_limit
+ * /sys/fs/ext4/<partition>/mb_max_linear_groups
*
* The regular allocator uses buddy scan only if the request len is power of
* 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
@@ -209,7 +214,7 @@
* get traversed linearly. That may result in subsequent allocations not being
* close to each other. And so, the underlying device may get filled up in a
* non-linear fashion. While that may not matter on non-rotational devices, for
- * rotational devices that may result in higher seek times. "mb_linear_limit"
+ * rotational devices that may result in higher seek times. "mb_max_linear_groups"
* tells mballoc how many groups mballoc should search linearly before
* consulting the above data structures for more efficient lookups. For
* non rotational devices, this value defaults to 0 and for rotational devices
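A reduced reader-side sketch of the locking rule stated above (hedged: consume() is a hypothetical hook; the field names match this patch):

static void scan_order(struct ext4_sb_info *sbi, int order)
{
	struct ext4_group_info *grp;
	unsigned long group;

	/* Readers walk the xarray under RCU; writers serialize on xa_lock. */
	rcu_read_lock();
	xa_for_each(&sbi->s_mb_largest_free_orders[order], group, grp)
		consume(grp);
	rcu_read_unlock();
}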
@@ -420,8 +425,8 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
-static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
- ext4_group_t group, enum criteria cr);
+static int ext4_mb_scan_group(struct ext4_allocation_context *ac,
+ ext4_group_t group);
static int ext4_try_to_trim_range(struct super_block *sb,
struct ext4_buddy *e4b, ext4_grpblk_t start,
@@ -841,132 +846,161 @@ static void
mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- int new_order;
+ int new, old;
- if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_fragments == 0)
+ if (!test_opt2(sb, MB_OPTIMIZE_SCAN))
return;
- new_order = mb_avg_fragment_size_order(sb,
- grp->bb_free / grp->bb_fragments);
- if (new_order == grp->bb_avg_fragment_size_order)
+ old = grp->bb_avg_fragment_size_order;
+ new = grp->bb_fragments == 0 ? -1 :
+ mb_avg_fragment_size_order(sb, grp->bb_free / grp->bb_fragments);
+ if (new == old)
return;
- if (grp->bb_avg_fragment_size_order != -1) {
- write_lock(&sbi->s_mb_avg_fragment_size_locks[
- grp->bb_avg_fragment_size_order]);
- list_del(&grp->bb_avg_fragment_size_node);
- write_unlock(&sbi->s_mb_avg_fragment_size_locks[
- grp->bb_avg_fragment_size_order]);
+ if (old >= 0)
+ xa_erase(&sbi->s_mb_avg_fragment_size[old], grp->bb_group);
+
+ grp->bb_avg_fragment_size_order = new;
+ if (new >= 0) {
+ /*
+ * Cannot use __GFP_NOFAIL because we hold the group lock.
+ * Although allocation for insertion may fail, it's not fatal
+ * as we have linear traversal to fall back on.
+ */
+ int err = xa_insert(&sbi->s_mb_avg_fragment_size[new],
+ grp->bb_group, grp, GFP_ATOMIC);
+ if (err)
+ mb_debug(sb, "insert group: %u to s_mb_avg_fragment_size[%d] failed, err %d",
+ grp->bb_group, new, err);
+ }
+}
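The erase-then-insert move above, reduced to a minimal stand-alone sketch (illustrative only; error handling trimmed to the debug print):

static void move_group(struct xarray *from, struct xarray *to,
		       struct ext4_group_info *grp)
{
	int err;

	xa_erase(from, grp->bb_group);
	/* GFP_ATOMIC: may run under the group spinlock, so no __GFP_NOFAIL. */
	err = xa_insert(to, grp->bb_group, grp, GFP_ATOMIC);
	if (err)
		pr_debug("xa_insert for group %u failed: %d\n",
			 grp->bb_group, err);
}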
+
+static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac,
+ struct xarray *xa,
+ ext4_group_t start, ext4_group_t end)
+{
+ struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ enum criteria cr = ac->ac_criteria;
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
+ unsigned long group = start;
+ struct ext4_group_info *grp;
+
+ if (WARN_ON_ONCE(end > ngroups || start >= end))
+ return 0;
+
+ xa_for_each_range(xa, group, grp, start, end - 1) {
+ int err;
+
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]);
+
+ err = ext4_mb_scan_group(ac, grp->bb_group);
+ if (err || ac->ac_status != AC_STATUS_CONTINUE)
+ return err;
+
+ cond_resched();
}
- grp->bb_avg_fragment_size_order = new_order;
- write_lock(&sbi->s_mb_avg_fragment_size_locks[
- grp->bb_avg_fragment_size_order]);
- list_add_tail(&grp->bb_avg_fragment_size_node,
- &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]);
- write_unlock(&sbi->s_mb_avg_fragment_size_locks[
- grp->bb_avg_fragment_size_order]);
+
+ return 0;
+}
+
+/*
+ * Find a suitable group of given order from the largest free orders xarray.
+ */
+static inline int
+ext4_mb_scan_groups_largest_free_order_range(struct ext4_allocation_context *ac,
+ int order, ext4_group_t start,
+ ext4_group_t end)
+{
+ struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_largest_free_orders[order];
+
+ if (xa_empty(xa))
+ return 0;
+
+ return ext4_mb_scan_groups_xa_range(ac, xa, start, end);
}
/*
* Choose next group by traversing largest_free_order lists. Updates *new_cr if
* cr level needs an update.
*/
-static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac,
- enum criteria *new_cr, ext4_group_t *group)
+static int ext4_mb_scan_groups_p2_aligned(struct ext4_allocation_context *ac,
+ ext4_group_t group)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- struct ext4_group_info *iter;
int i;
+ int ret = 0;
+ ext4_group_t start, end;
- if (ac->ac_status == AC_STATUS_FOUND)
- return;
-
- if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED))
- atomic_inc(&sbi->s_bal_p2_aligned_bad_suggestions);
-
+ start = group;
+ end = ext4_get_groups_count(ac->ac_sb);
+wrap_around:
for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
- if (list_empty(&sbi->s_mb_largest_free_orders[i]))
- continue;
- read_lock(&sbi->s_mb_largest_free_orders_locks[i]);
- if (list_empty(&sbi->s_mb_largest_free_orders[i])) {
- read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
- continue;
- }
- list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i],
- bb_largest_free_order_node) {
- if (sbi->s_mb_stats)
- atomic64_inc(&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]);
- if (likely(ext4_mb_good_group(ac, iter->bb_group, CR_POWER2_ALIGNED))) {
- *group = iter->bb_group;
- ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED;
- read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
- return;
- }
- }
- read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
+ ret = ext4_mb_scan_groups_largest_free_order_range(ac, i,
+ start, end);
+ if (ret || ac->ac_status != AC_STATUS_CONTINUE)
+ return ret;
+ }
+ if (start) {
+ end = start;
+ start = 0;
+ goto wrap_around;
}
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]);
+
/* Increment cr and search again if no group is found */
- *new_cr = CR_GOAL_LEN_FAST;
+ ac->ac_criteria = CR_GOAL_LEN_FAST;
+ return ret;
}
/*
- * Find a suitable group of given order from the average fragments list.
+ * Find a suitable group of given order from the average fragments xarray.
*/
-static struct ext4_group_info *
-ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int order)
+static int
+ext4_mb_scan_groups_avg_frag_order_range(struct ext4_allocation_context *ac,
+ int order, ext4_group_t start,
+ ext4_group_t end)
{
- struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- struct list_head *frag_list = &sbi->s_mb_avg_fragment_size[order];
- rwlock_t *frag_list_lock = &sbi->s_mb_avg_fragment_size_locks[order];
- struct ext4_group_info *grp = NULL, *iter;
- enum criteria cr = ac->ac_criteria;
+ struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_avg_fragment_size[order];
- if (list_empty(frag_list))
- return NULL;
- read_lock(frag_list_lock);
- if (list_empty(frag_list)) {
- read_unlock(frag_list_lock);
- return NULL;
- }
- list_for_each_entry(iter, frag_list, bb_avg_fragment_size_node) {
- if (sbi->s_mb_stats)
- atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]);
- if (likely(ext4_mb_good_group(ac, iter->bb_group, cr))) {
- grp = iter;
- break;
- }
- }
- read_unlock(frag_list_lock);
- return grp;
+ if (xa_empty(xa))
+ return 0;
+
+ return ext4_mb_scan_groups_xa_range(ac, xa, start, end);
}
/*
* Choose next group by traversing average fragment size list of suitable
* order. Updates *new_cr if cr level needs an update.
*/
-static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac,
- enum criteria *new_cr, ext4_group_t *group)
+static int ext4_mb_scan_groups_goal_fast(struct ext4_allocation_context *ac,
+ ext4_group_t group)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- struct ext4_group_info *grp = NULL;
- int i;
-
- if (unlikely(ac->ac_flags & EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED)) {
- if (sbi->s_mb_stats)
- atomic_inc(&sbi->s_bal_goal_fast_bad_suggestions);
+ int i, ret = 0;
+ ext4_group_t start, end;
+
+ start = group;
+ end = ext4_get_groups_count(ac->ac_sb);
+wrap_around:
+ i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
+ for (; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
+ ret = ext4_mb_scan_groups_avg_frag_order_range(ac, i,
+ start, end);
+ if (ret || ac->ac_status != AC_STATUS_CONTINUE)
+ return ret;
}
-
- for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
- i < MB_NUM_ORDERS(ac->ac_sb); i++) {
- grp = ext4_mb_find_good_group_avg_frag_lists(ac, i);
- if (grp) {
- *group = grp->bb_group;
- ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED;
- return;
- }
+ if (start) {
+ end = start;
+ start = 0;
+ goto wrap_around;
}
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]);
/*
* CR_BEST_AVAIL_LEN works based on the concept that we have
* a larger normalized goal len request which can be trimmed to
@@ -976,9 +1010,11 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *
* See function ext4_mb_normalize_request() (EXT4_MB_HINT_DATA).
*/
if (ac->ac_flags & EXT4_MB_HINT_DATA)
- *new_cr = CR_BEST_AVAIL_LEN;
+ ac->ac_criteria = CR_BEST_AVAIL_LEN;
else
- *new_cr = CR_GOAL_LEN_SLOW;
+ ac->ac_criteria = CR_GOAL_LEN_SLOW;
+
+ return ret;
}
/*
@@ -990,18 +1026,14 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *
* preallocations. However, we make sure that we don't trim the request too
* much and fall to CR_GOAL_LEN_SLOW in that case.
*/
-static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac,
- enum criteria *new_cr, ext4_group_t *group)
+static int ext4_mb_scan_groups_best_avail(struct ext4_allocation_context *ac,
+ ext4_group_t group)
{
+ int ret = 0;
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- struct ext4_group_info *grp = NULL;
int i, order, min_order;
unsigned long num_stripe_clusters = 0;
-
- if (unlikely(ac->ac_flags & EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED)) {
- if (sbi->s_mb_stats)
- atomic_inc(&sbi->s_bal_best_avail_bad_suggestions);
- }
+ ext4_group_t start, end;
/*
* mb_avg_fragment_size_order() returns order in a way that makes
@@ -1033,6 +1065,9 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
if (1 << min_order < ac->ac_o_ex.fe_len)
min_order = fls(ac->ac_o_ex.fe_len);
+ start = group;
+ end = ext4_get_groups_count(ac->ac_sb);
+wrap_around:
for (i = order; i >= min_order; i--) {
int frag_order;
/*
@@ -1055,17 +1090,24 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
frag_order = mb_avg_fragment_size_order(ac->ac_sb,
ac->ac_g_ex.fe_len);
- grp = ext4_mb_find_good_group_avg_frag_lists(ac, frag_order);
- if (grp) {
- *group = grp->bb_group;
- ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED;
- return;
- }
+ ret = ext4_mb_scan_groups_avg_frag_order_range(ac, frag_order,
+ start, end);
+ if (ret || ac->ac_status != AC_STATUS_CONTINUE)
+ return ret;
+ }
+ if (start) {
+ end = start;
+ start = 0;
+ goto wrap_around;
}
/* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */
ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
- *new_cr = CR_GOAL_LEN_SLOW;
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]);
+ ac->ac_criteria = CR_GOAL_LEN_SLOW;
+
+ return ret;
}
static inline int should_optimize_scan(struct ext4_allocation_context *ac)
@@ -1080,59 +1122,82 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac)
}
/*
- * Return next linear group for allocation.
+ * Advance to the next linear group for allocation.
*/
-static ext4_group_t
-next_linear_group(ext4_group_t group, ext4_group_t ngroups)
+static void next_linear_group(ext4_group_t *group, ext4_group_t ngroups)
{
/*
* Artificially restricted ngroups for non-extent
* files makes group > ngroups possible on the first loop.
*/
- return group + 1 >= ngroups ? 0 : group + 1;
+ *group = *group + 1 >= ngroups ? 0 : *group + 1;
}
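For example, with ngroups = 8: group 5 advances to 6 and group 7 wraps to 0; a goal group that starts out above ngroups (possible for non-extent files, per the comment) likewise wraps to 0 on its first step.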
-/*
- * ext4_mb_choose_next_group: choose next group for allocation.
- *
- * @ac Allocation Context
- * @new_cr This is an output parameter. If the there is no good group
- * available at current CR level, this field is updated to indicate
- * the new cr level that should be used.
- * @group This is an input / output parameter. As an input it indicates the
- * next group that the allocator intends to use for allocation. As
- * output, this field indicates the next group that should be used as
- * determined by the optimization functions.
- * @ngroups Total number of groups
- */
-static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
- enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+static int ext4_mb_scan_groups_linear(struct ext4_allocation_context *ac,
+ ext4_group_t ngroups, ext4_group_t *start, ext4_group_t count)
{
- *new_cr = ac->ac_criteria;
+ int ret, i;
+ enum criteria cr = ac->ac_criteria;
+ struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_group_t group = *start;
- if (!should_optimize_scan(ac)) {
- *group = next_linear_group(*group, ngroups);
- return;
+ for (i = 0; i < count; i++, next_linear_group(&group, ngroups)) {
+ ret = ext4_mb_scan_group(ac, group);
+ if (ret || ac->ac_status != AC_STATUS_CONTINUE)
+ return ret;
+ cond_resched();
}
+ *start = group;
+ if (count == ngroups)
+ ac->ac_criteria++;
+
+ /* Processed all groups and haven't found blocks */
+ if (sbi->s_mb_stats && i == ngroups)
+ atomic64_inc(&sbi->s_bal_cX_failed[cr]);
+
+ return 0;
+}
+
+static int ext4_mb_scan_groups(struct ext4_allocation_context *ac)
+{
+ int ret = 0;
+ ext4_group_t start;
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb);
+
+ /* non-extent files are limited to low blocks/groups */
+ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
+ ngroups = sbi->s_blockfile_groups;
+
+ /* start searching for the right group from the specified goal value */
+ start = ac->ac_g_ex.fe_group;
+ ac->ac_prefetch_grp = start;
+ ac->ac_prefetch_nr = 0;
+
+ if (!should_optimize_scan(ac))
+ return ext4_mb_scan_groups_linear(ac, ngroups, &start, ngroups);
+
/*
* Optimized scanning can return non-adjacent groups, which can cause
* seek overhead for rotational disks. So try a few linear groups before
* trying optimized scan.
*/
- if (ac->ac_groups_linear_remaining) {
- *group = next_linear_group(*group, ngroups);
- ac->ac_groups_linear_remaining--;
- return;
- }
+ if (sbi->s_mb_max_linear_groups)
+ ret = ext4_mb_scan_groups_linear(ac, ngroups, &start,
+ sbi->s_mb_max_linear_groups);
+ if (ret || ac->ac_status != AC_STATUS_CONTINUE)
+ return ret;
- if (*new_cr == CR_POWER2_ALIGNED) {
- ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group);
- } else if (*new_cr == CR_GOAL_LEN_FAST) {
- ext4_mb_choose_next_group_goal_fast(ac, new_cr, group);
- } else if (*new_cr == CR_BEST_AVAIL_LEN) {
- ext4_mb_choose_next_group_best_avail(ac, new_cr, group);
- } else {
+ switch (ac->ac_criteria) {
+ case CR_POWER2_ALIGNED:
+ return ext4_mb_scan_groups_p2_aligned(ac, start);
+ case CR_GOAL_LEN_FAST:
+ return ext4_mb_scan_groups_goal_fast(ac, start);
+ case CR_BEST_AVAIL_LEN:
+ return ext4_mb_scan_groups_best_avail(ac, start);
+ default:
/*
* TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an
* rb tree sorted by bb_free. But until that happens, we should
@@ -1140,6 +1205,8 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
*/
WARN_ON(1);
}
+
+ return 0;
}
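Taken together, the scan handlers advance ac->ac_criteria along the ladder CR_POWER2_ALIGNED (power-of-two requests only) -> CR_GOAL_LEN_FAST -> CR_BEST_AVAIL_LEN (data allocations with a goal hint, otherwise straight to CR_GOAL_LEN_SLOW) -> CR_GOAL_LEN_SLOW -> CR_ANY_FREE, which is why the repeat loop in ext4_mb_regular_allocator() below no longer needs a new_cr out-parameter.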
/*
@@ -1150,33 +1217,35 @@ static void
mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- int i;
+ int new, old = grp->bb_largest_free_order;
- for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--)
- if (grp->bb_counters[i] > 0)
+ for (new = MB_NUM_ORDERS(sb) - 1; new >= 0; new--)
+ if (grp->bb_counters[new] > 0)
break;
+
/* No need to move between order lists? */
- if (!test_opt2(sb, MB_OPTIMIZE_SCAN) ||
- i == grp->bb_largest_free_order) {
- grp->bb_largest_free_order = i;
+ if (new == old)
return;
- }
- if (grp->bb_largest_free_order >= 0) {
- write_lock(&sbi->s_mb_largest_free_orders_locks[
- grp->bb_largest_free_order]);
- list_del_init(&grp->bb_largest_free_order_node);
- write_unlock(&sbi->s_mb_largest_free_orders_locks[
- grp->bb_largest_free_order]);
+ if (old >= 0) {
+ struct xarray *xa = &sbi->s_mb_largest_free_orders[old];
+
+ if (!xa_empty(xa) && xa_load(xa, grp->bb_group))
+ xa_erase(xa, grp->bb_group);
}
- grp->bb_largest_free_order = i;
- if (grp->bb_largest_free_order >= 0 && grp->bb_free) {
- write_lock(&sbi->s_mb_largest_free_orders_locks[
- grp->bb_largest_free_order]);
- list_add_tail(&grp->bb_largest_free_order_node,
- &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]);
- write_unlock(&sbi->s_mb_largest_free_orders_locks[
- grp->bb_largest_free_order]);
+
+ grp->bb_largest_free_order = new;
+ if (test_opt2(sb, MB_OPTIMIZE_SCAN) && new >= 0 && grp->bb_free) {
+ /*
+ * Cannot use __GFP_NOFAIL because we hold the group lock.
+ * Although allocation for insertion may fail, it's not fatal
+ * as we have linear traversal to fall back on.
+ */
+ int err = xa_insert(&sbi->s_mb_largest_free_orders[new],
+ grp->bb_group, grp, GFP_ATOMIC);
+ if (err)
+ mb_debug(sb, "insert group: %u to s_mb_largest_free_orders[%d] failed, err %d",
+ grp->bb_group, new, err);
}
}
@@ -2167,11 +2236,11 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
folio_get(ac->ac_buddy_folio);
/* store last allocated for subsequent stream allocation */
if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
- spin_lock(&sbi->s_md_lock);
- sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
- sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
- spin_unlock(&sbi->s_md_lock);
+ int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals;
+
+ WRITE_ONCE(sbi->s_mb_last_groups[hash], ac->ac_f_ex.fe_group);
}
+
/*
* As we've just preallocated more space than
* user requested originally, we store allocated
@@ -2571,6 +2640,30 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
}
}
+static void __ext4_mb_scan_group(struct ext4_allocation_context *ac)
+{
+ bool is_stripe_aligned;
+ struct ext4_sb_info *sbi;
+ enum criteria cr = ac->ac_criteria;
+
+ ac->ac_groups_scanned++;
+ if (cr == CR_POWER2_ALIGNED)
+ return ext4_mb_simple_scan_group(ac, ac->ac_e4b);
+
+ sbi = EXT4_SB(ac->ac_sb);
+ is_stripe_aligned = false;
+ if ((sbi->s_stripe >= sbi->s_cluster_ratio) &&
+ !(ac->ac_g_ex.fe_len % EXT4_NUM_B2C(sbi, sbi->s_stripe)))
+ is_stripe_aligned = true;
+
+ if ((cr == CR_GOAL_LEN_FAST || cr == CR_BEST_AVAIL_LEN) &&
+ is_stripe_aligned)
+ ext4_mb_scan_aligned(ac, ac->ac_e4b);
+
+ if (ac->ac_status == AC_STATUS_CONTINUE)
+ ext4_mb_complex_scan_group(ac, ac->ac_e4b);
+}
+
/*
* This is also called BEFORE we load the buddy bitmap.
* Returns either 1 or 0 indicating that the group is either suitable
@@ -2761,6 +2854,37 @@ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
}
/*
+ * Batch reads of the block allocation bitmaps to get
+ * multiple READs in flight; limit prefetching at inexpensive
+ * CR, otherwise mballoc can spend a lot of time loading
+ * imperfect groups.
+ */
+static void ext4_mb_might_prefetch(struct ext4_allocation_context *ac,
+ ext4_group_t group)
+{
+ struct ext4_sb_info *sbi;
+
+ if (ac->ac_prefetch_grp != group)
+ return;
+
+ sbi = EXT4_SB(ac->ac_sb);
+ if (ext4_mb_cr_expensive(ac->ac_criteria) ||
+ ac->ac_prefetch_ios < sbi->s_mb_prefetch_limit) {
+ unsigned int nr = sbi->s_mb_prefetch;
+
+ if (ext4_has_feature_flex_bg(ac->ac_sb)) {
+ nr = 1 << sbi->s_log_groups_per_flex;
+ nr -= group & (nr - 1);
+ nr = umin(nr, sbi->s_mb_prefetch);
+ }
+
+ ac->ac_prefetch_nr = nr;
+ ac->ac_prefetch_grp = ext4_mb_prefetch(ac->ac_sb, group, nr,
+ &ac->ac_prefetch_ios);
+ }
+}
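Worked example of the flex_bg clamp: with s_log_groups_per_flex = 4, nr starts at 1 << 4 = 16; for group 21, group & 15 = 5, so nr becomes 11 before the umin() against s_mb_prefetch. The prefetch window is thus trimmed so it never reads past the end of the current flex group.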
+
+/*
* Prefetching reads the block bitmap into the buffer cache; but we
* need to make sure that the buddy bitmap in the page cache has been
* initialized. Note that ext4_mb_init_group() will block if the I/O
@@ -2793,24 +2917,58 @@ void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
}
}
+static int ext4_mb_scan_group(struct ext4_allocation_context *ac,
+ ext4_group_t group)
+{
+ int ret;
+ struct super_block *sb = ac->ac_sb;
+ enum criteria cr = ac->ac_criteria;
+
+ ext4_mb_might_prefetch(ac, group);
+
+ /* prevent unnecessary buddy loading. */
+ if (cr < CR_ANY_FREE && spin_is_locked(ext4_group_lock_ptr(sb, group)))
+ return 0;
+
+ /* This now checks without needing the buddy page */
+ ret = ext4_mb_good_group_nolock(ac, group, cr);
+ if (ret <= 0) {
+ if (!ac->ac_first_err)
+ ac->ac_first_err = ret;
+ return 0;
+ }
+
+ ret = ext4_mb_load_buddy(sb, group, ac->ac_e4b);
+ if (ret)
+ return ret;
+
+ /* skip busy group */
+ if (cr >= CR_ANY_FREE)
+ ext4_lock_group(sb, group);
+ else if (!ext4_try_lock_group(sb, group))
+ goto out_unload;
+
+ /* We need to check again after locking the block group. */
+ if (unlikely(!ext4_mb_good_group(ac, group, cr)))
+ goto out_unlock;
+
+ __ext4_mb_scan_group(ac);
+
+out_unlock:
+ ext4_unlock_group(sb, group);
+out_unload:
+ ext4_mb_unload_buddy(ac->ac_e4b);
+ return ret;
+}
+
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
- ext4_group_t prefetch_grp = 0, ngroups, group, i;
- enum criteria new_cr, cr = CR_GOAL_LEN_FAST;
- int err = 0, first_err = 0;
- unsigned int nr = 0, prefetch_ios = 0;
- struct ext4_sb_info *sbi;
- struct super_block *sb;
+ ext4_group_t i;
+ int err = 0;
+ struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_buddy e4b;
- int lost;
-
- sb = ac->ac_sb;
- sbi = EXT4_SB(sb);
- ngroups = ext4_get_groups_count(sb);
- /* non-extent files are limited to low blocks/groups */
- if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
- ngroups = sbi->s_blockfile_groups;
BUG_ON(ac->ac_status == AC_STATUS_FOUND);
@@ -2844,11 +3002,11 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
/* if stream allocation is enabled, use global goal */
if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
- /* TBD: may be hot point */
- spin_lock(&sbi->s_md_lock);
- ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
- ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
- spin_unlock(&sbi->s_md_lock);
+ int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals;
+
+ ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_groups[hash]);
+ ac->ac_g_ex.fe_start = -1;
+ ac->ac_flags &= ~EXT4_MB_HINT_TRY_GOAL;
}
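For example, an inode with i_ino = 1234 and s_mb_nr_global_goals = 16 hashes to slot 2, so streams from unrelated inodes update disjoint goal slots; this READ_ONCE() pairs with the WRITE_ONCE() in ext4_mb_use_best_found() to make the lockless access safe and retire the old s_md_lock round-trip.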
/*
@@ -2856,107 +3014,21 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
* start with CR_GOAL_LEN_FAST, unless it is power of 2
* aligned, in which case let's do that faster approach first.
*/
+ ac->ac_criteria = CR_GOAL_LEN_FAST;
if (ac->ac_2order)
- cr = CR_POWER2_ALIGNED;
-repeat:
- for (; cr < EXT4_MB_NUM_CRS && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
- ac->ac_criteria = cr;
- /*
- * searching for the right group start
- * from the goal value specified
- */
- group = ac->ac_g_ex.fe_group;
- ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups;
- prefetch_grp = group;
- nr = 0;
-
- for (i = 0, new_cr = cr; i < ngroups; i++,
- ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) {
- int ret = 0;
-
- cond_resched();
- if (new_cr != cr) {
- cr = new_cr;
- goto repeat;
- }
-
- /*
- * Batch reads of the block allocation bitmaps
- * to get multiple READs in flight; limit
- * prefetching at inexpensive CR, otherwise mballoc
- * can spend a lot of time loading imperfect groups
- */
- if ((prefetch_grp == group) &&
- (ext4_mb_cr_expensive(cr) ||
- prefetch_ios < sbi->s_mb_prefetch_limit)) {
- nr = sbi->s_mb_prefetch;
- if (ext4_has_feature_flex_bg(sb)) {
- nr = 1 << sbi->s_log_groups_per_flex;
- nr -= group & (nr - 1);
- nr = min(nr, sbi->s_mb_prefetch);
- }
- prefetch_grp = ext4_mb_prefetch(sb, group,
- nr, &prefetch_ios);
- }
-
- /* This now checks without needing the buddy page */
- ret = ext4_mb_good_group_nolock(ac, group, cr);
- if (ret <= 0) {
- if (!first_err)
- first_err = ret;
- continue;
- }
-
- err = ext4_mb_load_buddy(sb, group, &e4b);
- if (err)
- goto out;
-
- ext4_lock_group(sb, group);
-
- /*
- * We need to check again after locking the
- * block group
- */
- ret = ext4_mb_good_group(ac, group, cr);
- if (ret == 0) {
- ext4_unlock_group(sb, group);
- ext4_mb_unload_buddy(&e4b);
- continue;
- }
+ ac->ac_criteria = CR_POWER2_ALIGNED;
- ac->ac_groups_scanned++;
- if (cr == CR_POWER2_ALIGNED)
- ext4_mb_simple_scan_group(ac, &e4b);
- else {
- bool is_stripe_aligned =
- (sbi->s_stripe >=
- sbi->s_cluster_ratio) &&
- !(ac->ac_g_ex.fe_len %
- EXT4_NUM_B2C(sbi, sbi->s_stripe));
-
- if ((cr == CR_GOAL_LEN_FAST ||
- cr == CR_BEST_AVAIL_LEN) &&
- is_stripe_aligned)
- ext4_mb_scan_aligned(ac, &e4b);
-
- if (ac->ac_status == AC_STATUS_CONTINUE)
- ext4_mb_complex_scan_group(ac, &e4b);
- }
-
- ext4_unlock_group(sb, group);
- ext4_mb_unload_buddy(&e4b);
-
- if (ac->ac_status != AC_STATUS_CONTINUE)
- break;
- }
- /* Processed all groups and haven't found blocks */
- if (sbi->s_mb_stats && i == ngroups)
- atomic64_inc(&sbi->s_bal_cX_failed[cr]);
+ ac->ac_e4b = &e4b;
+ ac->ac_prefetch_ios = 0;
+ ac->ac_first_err = 0;
+repeat:
+ while (ac->ac_criteria < EXT4_MB_NUM_CRS) {
+ err = ext4_mb_scan_groups(ac);
+ if (err)
+ goto out;
- if (i == ngroups && ac->ac_criteria == CR_BEST_AVAIL_LEN)
- /* Reset goal length to original goal length before
- * falling into CR_GOAL_LEN_SLOW */
- ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
+ if (ac->ac_status != AC_STATUS_CONTINUE)
+ break;
}
if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
@@ -2967,6 +3039,8 @@ repeat:
*/
ext4_mb_try_best_found(ac, &e4b);
if (ac->ac_status != AC_STATUS_FOUND) {
+ int lost;
+
/*
* Someone more lucky has already allocated it.
* The only thing we can do is just take first
@@ -2982,23 +3056,27 @@ repeat:
ac->ac_b_ex.fe_len = 0;
ac->ac_status = AC_STATUS_CONTINUE;
ac->ac_flags |= EXT4_MB_HINT_FIRST;
- cr = CR_ANY_FREE;
+ ac->ac_criteria = CR_ANY_FREE;
goto repeat;
}
}
- if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND)
+ if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) {
atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
+ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC &&
+ ac->ac_b_ex.fe_group == ac->ac_g_ex.fe_group)
+ atomic_inc(&sbi->s_bal_stream_goals);
+ }
out:
- if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
- err = first_err;
+ if (!err && ac->ac_status != AC_STATUS_FOUND && ac->ac_first_err)
+ err = ac->ac_first_err;
mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
- ac->ac_flags, cr, err);
+ ac->ac_flags, ac->ac_criteria, err);
- if (nr)
- ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
+ if (ac->ac_prefetch_nr)
+ ext4_mb_prefetch_fini(sb, ac->ac_prefetch_grp, ac->ac_prefetch_nr);
return err;
}
@@ -3037,10 +3115,8 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
unsigned char blocksize_bits = min_t(unsigned char,
sb->s_blocksize_bits,
EXT4_MAX_BLOCK_LOG_SIZE);
- struct sg {
- struct ext4_group_info info;
- ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
- } sg;
+ DEFINE_RAW_FLEX(struct ext4_group_info, sg, bb_counters,
+ EXT4_MAX_BLOCK_LOG_SIZE + 2);
group--;
if (group == 0)
@@ -3048,7 +3124,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
" 2^0 2^1 2^2 2^3 2^4 2^5 2^6 "
" 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n");
- i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
+ i = (blocksize_bits + 2) * sizeof(sg->bb_counters[0]) +
sizeof(struct ext4_group_info);
grinfo = ext4_get_group_info(sb, group);
@@ -3068,14 +3144,14 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
* We care only about free space counters in the group info and
* these are safe to access even after the buddy has been unloaded
*/
- memcpy(&sg, grinfo, i);
- seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
- sg.info.bb_fragments, sg.info.bb_first_free);
+ memcpy(sg, grinfo, i);
+ seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg->bb_free,
+ sg->bb_fragments, sg->bb_first_free);
for (i = 0; i <= 13; i++)
seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
- sg.info.bb_counters[i] : 0);
+ sg->bb_counters[i] : 0);
seq_puts(seq, " ]");
- if (EXT4_MB_GRP_BBITMAP_CORRUPT(&sg.info))
+ if (EXT4_MB_GRP_BBITMAP_CORRUPT(sg))
seq_puts(seq, " Block bitmap corrupted!");
seq_putc(seq, '\n');
return 0;
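
DEFINE_RAW_FLEX() (from <linux/overflow.h>) replaces the hand-rolled wrapper struct: it declares on-stack storage sized for a struct plus a fixed number of flexible-array elements, and a pointer to it, which is why the accesses above change from sg.info.* to sg->*. A minimal sketch of the macro's shape, using a hypothetical struct:

	#include <linux/overflow.h>

	struct demo {
		int nr;
		int vals[];		/* flexible-array member */
	};

	void demo_use(void)
	{
		/* On-stack storage for struct demo + 8 ints; d is a struct demo *. */
		DEFINE_RAW_FLEX(struct demo, d, vals, 8);

		d->nr = 8;
		d->vals[0] = 1;
	}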
@@ -3123,8 +3199,6 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
atomic_read(&sbi->s_bal_cX_ex_scanned[CR_POWER2_ALIGNED]));
seq_printf(seq, "\t\tuseless_loops: %llu\n",
atomic64_read(&sbi->s_bal_cX_failed[CR_POWER2_ALIGNED]));
- seq_printf(seq, "\t\tbad_suggestions: %u\n",
- atomic_read(&sbi->s_bal_p2_aligned_bad_suggestions));
/* CR_GOAL_LEN_FAST stats */
seq_puts(seq, "\tcr_goal_fast_stats:\n");
@@ -3137,8 +3211,6 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_FAST]));
seq_printf(seq, "\t\tuseless_loops: %llu\n",
atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_FAST]));
- seq_printf(seq, "\t\tbad_suggestions: %u\n",
- atomic_read(&sbi->s_bal_goal_fast_bad_suggestions));
/* CR_BEST_AVAIL_LEN stats */
seq_puts(seq, "\tcr_best_avail_stats:\n");
@@ -3152,8 +3224,6 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
atomic_read(&sbi->s_bal_cX_ex_scanned[CR_BEST_AVAIL_LEN]));
seq_printf(seq, "\t\tuseless_loops: %llu\n",
atomic64_read(&sbi->s_bal_cX_failed[CR_BEST_AVAIL_LEN]));
- seq_printf(seq, "\t\tbad_suggestions: %u\n",
- atomic_read(&sbi->s_bal_best_avail_bad_suggestions));
/* CR_GOAL_LEN_SLOW stats */
seq_puts(seq, "\tcr_goal_slow_stats:\n");
@@ -3183,6 +3253,8 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
seq_printf(seq, "\textents_scanned: %u\n",
atomic_read(&sbi->s_bal_ex_scanned));
seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
+ seq_printf(seq, "\t\tstream_goal_hits: %u\n",
+ atomic_read(&sbi->s_bal_stream_goals));
seq_printf(seq, "\t\tlen_goal_hits: %u\n",
atomic_read(&sbi->s_bal_len_goals));
seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
@@ -3229,6 +3301,7 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
unsigned long position = ((unsigned long) v);
struct ext4_group_info *grp;
unsigned int count;
+ unsigned long idx;
position--;
if (position >= MB_NUM_ORDERS(sb)) {
@@ -3237,11 +3310,8 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
seq_puts(seq, "avg_fragment_size_lists:\n");
count = 0;
- read_lock(&sbi->s_mb_avg_fragment_size_locks[position]);
- list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position],
- bb_avg_fragment_size_node)
+ xa_for_each(&sbi->s_mb_avg_fragment_size[position], idx, grp)
count++;
- read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]);
seq_printf(seq, "\tlist_order_%u_groups: %u\n",
(unsigned int)position, count);
return 0;
@@ -3253,11 +3323,8 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
seq_puts(seq, "max_free_order_lists:\n");
}
count = 0;
- read_lock(&sbi->s_mb_largest_free_orders_locks[position]);
- list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position],
- bb_largest_free_order_node)
+ xa_for_each(&sbi->s_mb_largest_free_orders[position], idx, grp)
count++;
- read_unlock(&sbi->s_mb_largest_free_orders_locks[position]);
seq_printf(seq, "\tlist_order_%u_groups: %u\n",
(unsigned int)position, count);
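
Both per-order group trackers move from rwlock-protected linked lists to xarrays, so these seq_file readers can iterate without taking a lock (xa_for_each is safe under the RCU read lock it takes internally). The write side is outside this section; a hedged sketch of the assumed counterpart, indexing each order bucket by group number so membership is a plain store/erase:

	/* Assumed write side: move a group between order buckets. */
	xa_erase(&sbi->s_mb_avg_fragment_size[old_order], grp->bb_group);
	xa_store(&sbi->s_mb_avg_fragment_size[new_order], grp->bb_group,
		 grp, GFP_ATOMIC);

This is also why bb_avg_fragment_size_node/bb_largest_free_order_node disappear from ext4_mb_add_groupinfo() below: xarray membership needs no list node in the group info.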
@@ -3377,8 +3444,6 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
init_rwsem(&meta_group_info[i]->alloc_sem);
meta_group_info[i]->bb_free_root = RB_ROOT;
- INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node);
- INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node);
meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */
meta_group_info[i]->bb_group = group;
@@ -3588,6 +3653,20 @@ static void ext4_discard_work(struct work_struct *work)
ext4_mb_unload_buddy(&e4b);
}
+static inline void ext4_mb_avg_fragment_size_destroy(struct ext4_sb_info *sbi)
+{
+ for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++)
+ xa_destroy(&sbi->s_mb_avg_fragment_size[i]);
+ kfree(sbi->s_mb_avg_fragment_size);
+}
+
+static inline void ext4_mb_largest_free_orders_destroy(struct ext4_sb_info *sbi)
+{
+ for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++)
+ xa_destroy(&sbi->s_mb_largest_free_orders[i]);
+ kfree(sbi->s_mb_largest_free_orders);
+}
+
int ext4_mb_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3633,44 +3712,27 @@ int ext4_mb_init(struct super_block *sb)
} while (i < MB_NUM_ORDERS(sb));
sbi->s_mb_avg_fragment_size =
- kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
+ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct xarray),
GFP_KERNEL);
if (!sbi->s_mb_avg_fragment_size) {
ret = -ENOMEM;
goto out;
}
- sbi->s_mb_avg_fragment_size_locks =
- kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
- GFP_KERNEL);
- if (!sbi->s_mb_avg_fragment_size_locks) {
- ret = -ENOMEM;
- goto out;
- }
- for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
- INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]);
- rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]);
- }
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++)
+ xa_init(&sbi->s_mb_avg_fragment_size[i]);
+
sbi->s_mb_largest_free_orders =
- kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
+ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct xarray),
GFP_KERNEL);
if (!sbi->s_mb_largest_free_orders) {
ret = -ENOMEM;
goto out;
}
- sbi->s_mb_largest_free_orders_locks =
- kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
- GFP_KERNEL);
- if (!sbi->s_mb_largest_free_orders_locks) {
- ret = -ENOMEM;
- goto out;
- }
- for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
- INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]);
- rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]);
- }
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++)
+ xa_init(&sbi->s_mb_largest_free_orders[i]);
spin_lock_init(&sbi->s_md_lock);
- sbi->s_mb_free_pending = 0;
+ atomic_set(&sbi->s_mb_free_pending, 0);
INIT_LIST_HEAD(&sbi->s_freed_data_list[0]);
INIT_LIST_HEAD(&sbi->s_freed_data_list[1]);
INIT_LIST_HEAD(&sbi->s_discard_list);
@@ -3711,10 +3773,19 @@ int ext4_mb_init(struct super_block *sb)
sbi->s_mb_group_prealloc, EXT4_NUM_B2C(sbi, sbi->s_stripe));
}
+ sbi->s_mb_nr_global_goals = umin(num_possible_cpus(),
+ DIV_ROUND_UP(sbi->s_groups_count, 4));
+ sbi->s_mb_last_groups = kcalloc(sbi->s_mb_nr_global_goals,
+ sizeof(ext4_group_t), GFP_KERNEL);
+ if (sbi->s_mb_last_groups == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
if (sbi->s_locality_groups == NULL) {
ret = -ENOMEM;
- goto out;
+ goto out_free_last_groups;
}
for_each_possible_cpu(i) {
struct ext4_locality_group *lg;
@@ -3739,11 +3810,12 @@ int ext4_mb_init(struct super_block *sb)
out_free_locality_groups:
free_percpu(sbi->s_locality_groups);
sbi->s_locality_groups = NULL;
+out_free_last_groups:
+ kfree(sbi->s_mb_last_groups);
+ sbi->s_mb_last_groups = NULL;
out:
- kfree(sbi->s_mb_avg_fragment_size);
- kfree(sbi->s_mb_avg_fragment_size_locks);
- kfree(sbi->s_mb_largest_free_orders);
- kfree(sbi->s_mb_largest_free_orders_locks);
+ ext4_mb_avg_fragment_size_destroy(sbi);
+ ext4_mb_largest_free_orders_destroy(sbi);
kfree(sbi->s_mb_offsets);
sbi->s_mb_offsets = NULL;
kfree(sbi->s_mb_maxs);
@@ -3810,10 +3882,8 @@ void ext4_mb_release(struct super_block *sb)
kvfree(group_info);
rcu_read_unlock();
}
- kfree(sbi->s_mb_avg_fragment_size);
- kfree(sbi->s_mb_avg_fragment_size_locks);
- kfree(sbi->s_mb_largest_free_orders);
- kfree(sbi->s_mb_largest_free_orders_locks);
+ ext4_mb_avg_fragment_size_destroy(sbi);
+ ext4_mb_largest_free_orders_destroy(sbi);
kfree(sbi->s_mb_offsets);
kfree(sbi->s_mb_maxs);
iput(sbi->s_buddy_cache);
@@ -3843,6 +3913,7 @@ void ext4_mb_release(struct super_block *sb)
}
free_percpu(sbi->s_locality_groups);
+ kfree(sbi->s_mb_last_groups);
}
static inline int ext4_issue_discard(struct super_block *sb,
@@ -3873,10 +3944,7 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
/* we expect to find existing buddy because it's pinned */
BUG_ON(err != 0);
- spin_lock(&EXT4_SB(sb)->s_md_lock);
- EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count;
- spin_unlock(&EXT4_SB(sb)->s_md_lock);
-
+ atomic_sub(entry->efd_count, &EXT4_SB(sb)->s_mb_free_pending);
db = e4b.bd_info;
/* there are blocks to put in buddy to make them really free */
count += entry->efd_count;
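
s_mb_free_pending becomes an atomic_t, so its update sites no longer bounce through s_md_lock. The counter's full lifecycle under the new scheme, grounded in the hunks in this patch:

	atomic_set(&sbi->s_mb_free_pending, 0);		/* ext4_mb_init() */
	atomic_add(clusters, &sbi->s_mb_free_pending);	/* extent queued for freeing */
	atomic_sub(entry->efd_count, &sbi->s_mb_free_pending); /* buddy updated */

Readers can now sample the counter with a plain atomic_read() without taking the lock.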
@@ -5653,7 +5721,7 @@ static inline void ext4_mb_show_pa(struct super_block *sb)
{
ext4_group_t i, ngroups;
- if (ext4_forced_shutdown(sb))
+ if (ext4_emergency_state(sb))
return;
ngroups = ext4_get_groups_count(sb);
@@ -5687,7 +5755,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
- if (ext4_forced_shutdown(sb))
+ if (ext4_emergency_state(sb))
return;
mb_debug(sb, "Can't allocate:"
@@ -6280,28 +6348,63 @@ out:
* are contiguous, AND the extents were freed by the same transaction,
* AND the blocks are associated with the same group.
*/
-static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
- struct ext4_free_data *entry,
- struct ext4_free_data *new_entry,
- struct rb_root *entry_rb_root)
+static inline bool
+ext4_freed_extents_can_be_merged(struct ext4_free_data *entry1,
+ struct ext4_free_data *entry2)
{
- if ((entry->efd_tid != new_entry->efd_tid) ||
- (entry->efd_group != new_entry->efd_group))
- return;
- if (entry->efd_start_cluster + entry->efd_count ==
- new_entry->efd_start_cluster) {
- new_entry->efd_start_cluster = entry->efd_start_cluster;
- new_entry->efd_count += entry->efd_count;
- } else if (new_entry->efd_start_cluster + new_entry->efd_count ==
- entry->efd_start_cluster) {
- new_entry->efd_count += entry->efd_count;
- } else
- return;
+ if (entry1->efd_tid != entry2->efd_tid)
+ return false;
+ if (entry1->efd_start_cluster + entry1->efd_count !=
+ entry2->efd_start_cluster)
+ return false;
+ if (WARN_ON_ONCE(entry1->efd_group != entry2->efd_group))
+ return false;
+ return true;
+}
+
+static inline void
+ext4_merge_freed_extents(struct ext4_sb_info *sbi, struct rb_root *root,
+ struct ext4_free_data *entry1,
+ struct ext4_free_data *entry2)
+{
+ entry1->efd_count += entry2->efd_count;
spin_lock(&sbi->s_md_lock);
- list_del(&entry->efd_list);
+ list_del(&entry2->efd_list);
spin_unlock(&sbi->s_md_lock);
- rb_erase(&entry->efd_node, entry_rb_root);
- kmem_cache_free(ext4_free_data_cachep, entry);
+ rb_erase(&entry2->efd_node, root);
+ kmem_cache_free(ext4_free_data_cachep, entry2);
+}
+
+static inline void
+ext4_try_merge_freed_extent_prev(struct ext4_sb_info *sbi, struct rb_root *root,
+ struct ext4_free_data *entry)
+{
+ struct ext4_free_data *prev;
+ struct rb_node *node;
+
+ node = rb_prev(&entry->efd_node);
+ if (!node)
+ return;
+
+ prev = rb_entry(node, struct ext4_free_data, efd_node);
+ if (ext4_freed_extents_can_be_merged(prev, entry))
+ ext4_merge_freed_extents(sbi, root, prev, entry);
+}
+
+static inline void
+ext4_try_merge_freed_extent_next(struct ext4_sb_info *sbi, struct rb_root *root,
+ struct ext4_free_data *entry)
+{
+ struct ext4_free_data *next;
+ struct rb_node *node;
+
+ node = rb_next(&entry->efd_node);
+ if (!node)
+ return;
+
+ next = rb_entry(node, struct ext4_free_data, efd_node);
+ if (ext4_freed_extents_can_be_merged(entry, next))
+ ext4_merge_freed_extents(sbi, root, entry, next);
}
static noinline_for_stack void
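
The old ext4_try_merge_freed_extent() mixed the adjacency test, the merge, and the unlink; it is now split into a directional predicate (entry1 must end exactly where entry2 starts, within the same transaction), a merge step, and thin prev/next wrappers. A worked example with hypothetical numbers: for entry1 = {start 100, count 20} and entry2 = {start 120, count 30} with matching efd_tid, ext4_freed_extents_can_be_merged(entry1, entry2) holds because 100 + 20 == 120; after ext4_merge_freed_extents(), entry1 covers {start 100, count 50} and entry2 has been unlinked from the list and rb-tree and freed. The group comparison becomes a WARN_ON_ONCE because adjacent nodes in a per-group rb-tree should never disagree on the group.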
@@ -6311,11 +6414,12 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
ext4_group_t group = e4b->bd_group;
ext4_grpblk_t cluster;
ext4_grpblk_t clusters = new_entry->efd_count;
- struct ext4_free_data *entry;
+ struct ext4_free_data *entry = NULL;
struct ext4_group_info *db = e4b->bd_info;
struct super_block *sb = e4b->bd_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct rb_node **n = &db->bb_free_root.rb_node, *node;
+ struct rb_root *root = &db->bb_free_root;
+ struct rb_node **n = &root->rb_node;
struct rb_node *parent = NULL, *new_node;
BUG_ON(!ext4_handle_valid(handle));
@@ -6351,27 +6455,30 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
}
}
- rb_link_node(new_node, parent, n);
- rb_insert_color(new_node, &db->bb_free_root);
+ atomic_add(clusters, &sbi->s_mb_free_pending);
+ if (!entry)
+ goto insert;
- /* Now try to see the extent can be merged to left and right */
- node = rb_prev(new_node);
- if (node) {
- entry = rb_entry(node, struct ext4_free_data, efd_node);
- ext4_try_merge_freed_extent(sbi, entry, new_entry,
- &(db->bb_free_root));
+	/* Now try to see if the extent can be merged with prev and next */
+ if (ext4_freed_extents_can_be_merged(new_entry, entry)) {
+ entry->efd_start_cluster = cluster;
+ entry->efd_count += new_entry->efd_count;
+ kmem_cache_free(ext4_free_data_cachep, new_entry);
+ ext4_try_merge_freed_extent_prev(sbi, root, entry);
+ return;
}
-
- node = rb_next(new_node);
- if (node) {
- entry = rb_entry(node, struct ext4_free_data, efd_node);
- ext4_try_merge_freed_extent(sbi, entry, new_entry,
- &(db->bb_free_root));
+ if (ext4_freed_extents_can_be_merged(entry, new_entry)) {
+ entry->efd_count += new_entry->efd_count;
+ kmem_cache_free(ext4_free_data_cachep, new_entry);
+ ext4_try_merge_freed_extent_next(sbi, root, entry);
+ return;
}
+insert:
+ rb_link_node(new_node, parent, n);
+ rb_insert_color(new_node, root);
spin_lock(&sbi->s_md_lock);
list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list[new_entry->efd_tid & 1]);
- sbi->s_mb_free_pending += clusters;
spin_unlock(&sbi->s_md_lock);
}
@@ -6644,7 +6751,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
for (i = 0; i < count; i++) {
cond_resched();
if (is_metadata)
- bh = sb_find_get_block(inode->i_sb, block + i);
+ bh = sb_find_get_block_nonatomic(inode->i_sb,
+ block + i);
ext4_forget(handle, is_metadata, inode, bh, block + i);
}
}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index f8280de3e882..15a049f05d04 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -192,8 +192,13 @@ struct ext4_allocation_context {
*/
ext4_grpblk_t ac_orig_goal_len;
+ ext4_group_t ac_prefetch_grp;
+ unsigned int ac_prefetch_ios;
+ unsigned int ac_prefetch_nr;
+
+ int ac_first_err;
+
__u32 ac_flags; /* allocation hints */
- __u32 ac_groups_linear_remaining;
__u16 ac_groups_scanned;
__u16 ac_found;
__u16 ac_cX_found[EXT4_MB_NUM_CRS];
@@ -204,6 +209,8 @@ struct ext4_allocation_context {
__u8 ac_2order; /* if request is to allocate 2^N blocks and
* N > 0, the field stores N, otherwise 0 */
__u8 ac_op; /* operation, for history only */
+
+ struct ext4_buddy *ac_e4b;
struct folio *ac_bitmap_folio;
struct folio *ac_buddy_folio;
struct ext4_prealloc_space *ac_pa;
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index d64c04ed061a..51661570cf3b 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -14,14 +14,14 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
int offset = offsetof(struct mmp_struct, mmp_checksum);
__u32 csum;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (char *)mmp, offset);
+ csum = ext4_chksum(sbi->s_csum_seed, (char *)mmp, offset);
return cpu_to_le32(csum);
}
static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
{
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return 1;
return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp);
@@ -29,7 +29,7 @@ static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
{
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return;
mmp->mmp_checksum = ext4_mmp_csum(sb, mmp);
@@ -162,7 +162,7 @@ static int kmmpd(void *data)
memcpy(mmp->mmp_nodename, init_utsname()->nodename,
sizeof(mmp->mmp_nodename));
- while (!kthread_should_stop() && !ext4_forced_shutdown(sb)) {
+ while (!kthread_should_stop() && !ext4_emergency_state(sb)) {
if (!ext4_has_feature_mmp(sb)) {
ext4_warning(sb, "kmmpd being stopped since MMP feature"
" has been disabled.");
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 898443e98efc..adae3caf175a 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -269,7 +269,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
unsigned int tmp_data_size, data_size, replaced_size;
int i, err2, jblocks, retries = 0;
int replaced_count = 0;
- int from = data_offset_in_page << orig_inode->i_blkbits;
+ int from;
int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits;
struct super_block *sb = orig_inode->i_sb;
struct buffer_head *bh = NULL;
@@ -280,7 +280,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
*/
again:
*err = 0;
- jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
+ jblocks = ext4_meta_trans_blocks(orig_inode, block_len_in_page,
+ block_len_in_page) * 2;
handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks);
if (IS_ERR(handle)) {
*err = PTR_ERR(handle);
@@ -323,11 +324,6 @@ again:
* hold page's lock, if it is still the case data copy is not
* necessary, just swap data blocks between orig and donor.
*/
-
- VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]);
- VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]);
- VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]);
-
if (unwritten) {
ext4_double_down_write_data_sem(orig_inode, donor_inode);
/* If any of extents in range became initialized we have to
@@ -360,6 +356,8 @@ again:
goto unlock_folios;
}
data_copy:
+ from = offset_in_folio(folio[0],
+ orig_blk_offset << orig_inode->i_blkbits);
*err = mext_page_mkuptodate(folio[0], from, from + replaced_size);
if (*err)
goto unlock_folios;
@@ -390,7 +388,7 @@ data_copy:
if (!bh)
bh = create_empty_buffers(folio[0],
1 << orig_inode->i_blkbits, 0);
- for (i = 0; i < data_offset_in_page; i++)
+ for (i = 0; i < from >> orig_inode->i_blkbits; i++)
bh = bh->b_this_page;
for (i = 0; i < block_len_in_page; i++) {
*err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0);
@@ -399,7 +397,7 @@ data_copy:
bh = bh->b_this_page;
}
- block_commit_write(&folio[0]->page, from, from + replaced_size);
+ block_commit_write(folio[0], from, from + replaced_size);
/* Even in case of data=writeback it is reasonable to pin
* inode to transaction, to prevent unexpected data loss */
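
With the VM_BUG_ONs on large folios gone, `from` can no longer be precomputed from data_offset_in_page (a per-PAGE_SIZE quantity); it is derived from the folio itself once the folio is locked. The key computation, valid for any folio size:

	/* Byte offset of the first moved block within folio[0]. */
	from = offset_in_folio(folio[0],
			       orig_blk_offset << orig_inode->i_blkbits);

The buffer-head walk then advances from >> i_blkbits buffers instead of data_offset_in_page, and block_commit_write() now takes the folio directly.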
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 536d56d15072..2cd36f59c9e3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -176,7 +176,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
brelse(bh);
return ERR_PTR(-EFSCORRUPTED);
}
- if (!ext4_has_metadata_csum(inode->i_sb) ||
+ if (!ext4_has_feature_metadata_csum(inode->i_sb) ||
buffer_verified(bh))
return bh;
@@ -291,36 +291,6 @@ struct dx_tail {
__le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */
};
-static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
-static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
-static inline unsigned dx_get_hash(struct dx_entry *entry);
-static void dx_set_hash(struct dx_entry *entry, unsigned value);
-static unsigned dx_get_count(struct dx_entry *entries);
-static unsigned dx_get_limit(struct dx_entry *entries);
-static void dx_set_count(struct dx_entry *entries, unsigned value);
-static void dx_set_limit(struct dx_entry *entries, unsigned value);
-static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
-static unsigned dx_node_limit(struct inode *dir);
-static struct dx_frame *dx_probe(struct ext4_filename *fname,
- struct inode *dir,
- struct dx_hash_info *hinfo,
- struct dx_frame *frame);
-static void dx_release(struct dx_frame *frames);
-static int dx_make_map(struct inode *dir, struct buffer_head *bh,
- struct dx_hash_info *hinfo,
- struct dx_map_entry *map_tail);
-static void dx_sort_map(struct dx_map_entry *map, unsigned count);
-static struct ext4_dir_entry_2 *dx_move_dirents(struct inode *dir, char *from,
- char *to, struct dx_map_entry *offsets,
- int count, unsigned int blocksize);
-static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base,
- unsigned int blocksize);
-static void dx_insert_block(struct dx_frame *frame,
- u32 hash, ext4_lblk_t block);
-static int ext4_htree_next_block(struct inode *dir, __u32 hash,
- struct dx_frame *frame,
- struct dx_frame *frames,
- __u32 *start_hash);
static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **res_dir);
@@ -376,11 +346,10 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
static __le32 ext4_dirblock_csum(struct inode *inode, void *dirent, int size)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
__u32 csum;
- csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
+ csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size);
return cpu_to_le32(csum);
}
@@ -398,7 +367,7 @@ int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh)
{
struct ext4_dir_entry_tail *t;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return 1;
t = get_dirent_tail(inode, bh);
@@ -419,7 +388,7 @@ static void ext4_dirblock_csum_set(struct inode *inode,
{
struct ext4_dir_entry_tail *t;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return;
t = get_dirent_tail(inode, bh);
@@ -472,7 +441,6 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
int count_offset, int count, struct dx_tail *t)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
__u32 csum;
int size;
@@ -480,9 +448,9 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
int offset = offsetof(struct dx_tail, dt_checksum);
size = count_offset + (count * sizeof(struct dx_entry));
- csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
- csum = ext4_chksum(sbi, csum, (__u8 *)t, offset);
- csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
+ csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size);
+ csum = ext4_chksum(csum, (__u8 *)t, offset);
+ csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
return cpu_to_le32(csum);
}
@@ -494,7 +462,7 @@ static int ext4_dx_csum_verify(struct inode *inode,
struct dx_tail *t;
int count_offset, limit, count;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return 1;
c = get_dx_countlimit(inode, dirent, &count_offset);
@@ -523,7 +491,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
struct dx_tail *t;
int count_offset, limit, count;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return;
c = get_dx_countlimit(inode, dirent, &count_offset);
@@ -612,7 +580,7 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
ext4_dir_rec_len(1, NULL) -
ext4_dir_rec_len(2, NULL) - infosize;
- if (ext4_has_metadata_csum(dir->i_sb))
+ if (ext4_has_feature_metadata_csum(dir->i_sb))
entry_space -= sizeof(struct dx_tail);
return entry_space / sizeof(struct dx_entry);
}
@@ -622,7 +590,7 @@ static inline unsigned dx_node_limit(struct inode *dir)
unsigned int entry_space = dir->i_sb->s_blocksize -
ext4_dir_rec_len(0, dir);
- if (ext4_has_metadata_csum(dir->i_sb))
+ if (ext4_has_feature_metadata_csum(dir->i_sb))
entry_space -= sizeof(struct dx_tail);
return entry_space / sizeof(struct dx_entry);
}
@@ -1076,7 +1044,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
struct ext4_dir_entry_2 *de, *top;
int err = 0, count = 0;
struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str;
- int csum = ext4_has_metadata_csum(dir->i_sb);
+ int csum = ext4_has_feature_metadata_csum(dir->i_sb);
dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
(unsigned long)block));
@@ -1320,7 +1288,7 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh,
struct dx_hash_info h = *hinfo;
int blocksize = EXT4_BLOCK_SIZE(dir->i_sb);
- if (ext4_has_metadata_csum(dir->i_sb))
+ if (ext4_has_feature_metadata_csum(dir->i_sb))
buflen -= sizeof(struct ext4_dir_entry_tail);
while ((char *) de < base + buflen) {
@@ -1462,7 +1430,8 @@ static bool ext4_match(struct inode *parent,
* sure cf_name was properly initialized before
* considering the calculated hash.
*/
- if (IS_ENCRYPTED(parent) && fname->cf_name.name &&
+ if (sb_no_casefold_compat_fallback(parent->i_sb) &&
+ IS_ENCRYPTED(parent) && fname->cf_name.name &&
(fname->hinfo.hash != EXT4_DIRENT_HASH(de) ||
fname->hinfo.minor_hash != EXT4_DIRENT_MINOR_HASH(de)))
return false;
@@ -1595,10 +1564,15 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
* return. Otherwise, fall back to doing a search the
* old fashioned way.
*/
- if (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR)
+ if (IS_ERR(ret) && PTR_ERR(ret) == ERR_BAD_DX_DIR)
+ dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
+ "falling back\n"));
+ else if (!sb_no_casefold_compat_fallback(dir->i_sb) &&
+ *res_dir == NULL && IS_CASEFOLDED(dir))
+ dxtrace(printk(KERN_DEBUG "ext4_find_entry: casefold "
+ "failed, falling back\n"));
+ else
goto cleanup_and_exit;
- dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
- "falling back\n"));
ret = NULL;
}
nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
@@ -1945,7 +1919,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
int csum_size = 0;
int err = 0, i;
- if (ext4_has_metadata_csum(dir->i_sb))
+ if (ext4_has_feature_metadata_csum(dir->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
bh2 = ext4_append(handle, dir, &newblock);
@@ -1995,7 +1969,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
* split it in half by count; each resulting block will have at least
* half the space free.
*/
- if (i > 0)
+ if (i >= 0)
split = count - move;
else
split = count/2;
@@ -2060,8 +2034,7 @@ out:
return ERR_PTR(err);
}
-int ext4_find_dest_de(struct inode *dir, struct inode *inode,
- struct buffer_head *bh,
+int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh,
void *buf, int buf_size,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **dest_de)
@@ -2143,11 +2116,11 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
int csum_size = 0;
int err, err2;
- if (ext4_has_metadata_csum(inode->i_sb))
+ if (ext4_has_feature_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
if (!de) {
- err = ext4_find_dest_de(dir, inode, bh, bh->b_data,
+ err = ext4_find_dest_de(dir, bh, bh->b_data,
blocksize - csum_size, fname, &de);
if (err)
return err;
@@ -2252,7 +2225,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
struct fake_dirent *fde;
int csum_size = 0;
- if (ext4_has_metadata_csum(inode->i_sb))
+ if (ext4_has_feature_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
blocksize = dir->i_sb->s_blocksize;
@@ -2396,7 +2369,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
ext4_lblk_t block, blocks;
int csum_size = 0;
- if (ext4_has_metadata_csum(inode->i_sb))
+ if (ext4_has_feature_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
sb = dir->i_sb;
@@ -2427,7 +2400,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
if (!retval || (retval != ERR_BAD_DX_DIR))
goto out;
/* Can we just ignore htree data? */
- if (ext4_has_metadata_csum(sb)) {
+ if (ext4_has_feature_metadata_csum(sb)) {
EXT4_ERROR_INODE(dir,
"Directory has corrupted htree index.");
retval = -EFSCORRUPTED;
@@ -2577,8 +2550,10 @@ again:
BUFFER_TRACE(frame->bh, "get_write_access");
err = ext4_journal_get_write_access(handle, sb, frame->bh,
EXT4_JTR_NONE);
- if (err)
+ if (err) {
+ brelse(bh2);
goto journal_error;
+ }
if (!add_level) {
unsigned icount1 = icount/2, icount2 = icount - icount1;
unsigned hash2 = dx_get_hash(entries + icount1);
@@ -2589,8 +2564,10 @@ again:
err = ext4_journal_get_write_access(handle, sb,
(frame - 1)->bh,
EXT4_JTR_NONE);
- if (err)
+ if (err) {
+ brelse(bh2);
goto journal_error;
+ }
memcpy((char *) entries2, (char *) (entries + icount1),
icount2 * sizeof(struct dx_entry));
@@ -2609,8 +2586,10 @@ again:
dxtrace(dx_show_index("node",
((struct dx_node *) bh2->b_data)->entries));
err = ext4_handle_dirty_dx_node(handle, dir, bh2);
- if (err)
+ if (err) {
+ brelse(bh2);
goto journal_error;
+ }
brelse (bh2);
err = ext4_handle_dirty_dx_node(handle, dir,
(frame - 1)->bh);
@@ -2635,8 +2614,10 @@ again:
"Creating %d level index...\n",
dxroot->info.indirect_levels));
err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
- if (err)
+ if (err) {
+ brelse(bh2);
goto journal_error;
+ }
err = ext4_handle_dirty_dx_node(handle, dir, bh2);
brelse(bh2);
restart = 1;
@@ -2733,7 +2714,7 @@ static int ext4_delete_entry(handle_t *handle,
return err;
}
- if (ext4_has_metadata_csum(dir->i_sb))
+ if (ext4_has_feature_metadata_csum(dir->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
BUFFER_TRACE(bh, "get_write_access");
@@ -2934,48 +2915,59 @@ err_unlock_inode:
return err;
}
-struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
- struct ext4_dir_entry_2 *de,
- int blocksize, int csum_size,
- unsigned int parent_ino, int dotdot_real_len)
+int ext4_init_dirblock(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, unsigned int parent_ino,
+ void *inline_buf, int inline_size)
{
+ struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) bh->b_data;
+ size_t blocksize = bh->b_size;
+ int csum_size = 0, header_size;
+
+ if (ext4_has_feature_metadata_csum(inode->i_sb))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
+
de->inode = cpu_to_le32(inode->i_ino);
de->name_len = 1;
de->rec_len = ext4_rec_len_to_disk(ext4_dir_rec_len(de->name_len, NULL),
blocksize);
- strcpy(de->name, ".");
+ memcpy(de->name, ".", 2);
ext4_set_de_type(inode->i_sb, de, S_IFDIR);
de = ext4_next_entry(de, blocksize);
de->inode = cpu_to_le32(parent_ino);
de->name_len = 2;
- if (!dotdot_real_len)
- de->rec_len = ext4_rec_len_to_disk(blocksize -
- (csum_size + ext4_dir_rec_len(1, NULL)),
- blocksize);
- else
+ memcpy(de->name, "..", 3);
+ ext4_set_de_type(inode->i_sb, de, S_IFDIR);
+ if (inline_buf) {
de->rec_len = ext4_rec_len_to_disk(
ext4_dir_rec_len(de->name_len, NULL),
blocksize);
- strcpy(de->name, "..");
- ext4_set_de_type(inode->i_sb, de, S_IFDIR);
+ de = ext4_next_entry(de, blocksize);
+ header_size = (char *)de - bh->b_data;
+ memcpy((void *)de, inline_buf, inline_size);
+ ext4_update_final_de(bh->b_data, inline_size + header_size,
+ blocksize - csum_size);
+ } else {
+ de->rec_len = ext4_rec_len_to_disk(blocksize -
+ (csum_size + ext4_dir_rec_len(1, NULL)),
+ blocksize);
+ }
- return ext4_next_entry(de, blocksize);
+ if (csum_size)
+ ext4_initialize_dirent_tail(bh, blocksize);
+	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ set_buffer_uptodate(bh);
+ set_buffer_verified(bh);
+ return ext4_handle_dirty_dirblock(handle, inode, bh);
}
int ext4_init_new_dir(handle_t *handle, struct inode *dir,
struct inode *inode)
{
struct buffer_head *dir_block = NULL;
- struct ext4_dir_entry_2 *de;
ext4_lblk_t block = 0;
- unsigned int blocksize = dir->i_sb->s_blocksize;
- int csum_size = 0;
int err;
- if (ext4_has_metadata_csum(dir->i_sb))
- csum_size = sizeof(struct ext4_dir_entry_tail);
-
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
err = ext4_try_create_inline_dir(handle, dir, inode);
if (err < 0 && err != -ENOSPC)
@@ -2984,39 +2976,30 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir,
goto out;
}
+ set_nlink(inode, 2);
inode->i_size = 0;
dir_block = ext4_append(handle, inode, &block);
if (IS_ERR(dir_block))
return PTR_ERR(dir_block);
- de = (struct ext4_dir_entry_2 *)dir_block->b_data;
- ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0);
- set_nlink(inode, 2);
- if (csum_size)
- ext4_initialize_dirent_tail(dir_block, blocksize);
-
- BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_dirblock(handle, inode, dir_block);
- if (err)
- goto out;
- set_buffer_verified(dir_block);
+ err = ext4_init_dirblock(handle, inode, dir_block, dir->i_ino, NULL, 0);
out:
brelse(dir_block);
return err;
}
-static int ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
handle_t *handle;
struct inode *inode;
int err, err2 = 0, credits, retries = 0;
if (EXT4_DIR_LINK_MAX(dir))
- return -EMLINK;
+ return ERR_PTR(-EMLINK);
err = dquot_initialize(dir);
if (err)
- return err;
+ return ERR_PTR(err);
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
@@ -3066,7 +3049,7 @@ out_stop:
out_retry:
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
- return err;
+ return ERR_PTR(err);
}
/*
@@ -3101,7 +3084,8 @@ bool ext4_empty_dir(struct inode *inode)
de = (struct ext4_dir_entry_2 *) bh->b_data;
if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size,
0) ||
- le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) {
+ le32_to_cpu(de->inode) != inode->i_ino || de->name_len != 1 ||
+ de->name[0] != '.') {
ext4_warning_inode(inode, "directory missing '.'");
brelse(bh);
return false;
@@ -3110,7 +3094,8 @@ bool ext4_empty_dir(struct inode *inode)
de = ext4_next_entry(de, sb->s_blocksize);
if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size,
offset) ||
- le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) {
+ le32_to_cpu(de->inode) == 0 || de->name_len != 2 ||
+ de->name[0] != '.' || de->name[1] != '.') {
ext4_warning_inode(inode, "directory missing '..'");
brelse(bh);
return false;
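
strcmp() against de->name was never safe here: ext4 directory entry names are length-prefixed, not NUL-terminated, so the old checks could read past the name or match stale bytes. The replacement compares name_len and the bytes explicitly. A hedged helper sketch (hypothetical; the patch open-codes the comparison):

	/* Hypothetical helper: dirent names are not NUL-terminated. */
	static inline bool de_name_is(const struct ext4_dir_entry_2 *de,
				      const char *name, __u8 len)
	{
		return de->name_len == len && !memcmp(de->name, name, len);
	}

	/* de_name_is(de, ".", 1); de_name_is(de, "..", 2); */

The same pattern recurs in ext4_get_first_dir_block() below.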
@@ -3151,8 +3136,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
struct ext4_dir_entry_2 *de;
handle_t *handle = NULL;
- if (unlikely(ext4_forced_shutdown(dir->i_sb)))
- return -EIO;
+ retval = ext4_emergency_state(dir->i_sb);
+ if (unlikely(retval))
+ return retval;
/* Initialize quotas before so that eventual writes go in
* separate transaction */
@@ -3309,8 +3295,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
{
int retval;
- if (unlikely(ext4_forced_shutdown(dir->i_sb)))
- return -EIO;
+ retval = ext4_emergency_state(dir->i_sb);
+ if (unlikely(retval))
+ return retval;
trace_ext4_unlink_enter(dir, dentry);
/*
@@ -3376,8 +3363,9 @@ static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct fscrypt_str disk_link;
int retries = 0;
- if (unlikely(ext4_forced_shutdown(dir->i_sb)))
- return -EIO;
+ err = ext4_emergency_state(dir->i_sb);
+ if (unlikely(err))
+ return err;
err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize,
&disk_link);
@@ -3548,7 +3536,7 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data,
bh->b_size, 0) ||
le32_to_cpu(de->inode) != inode->i_ino ||
- strcmp(".", de->name)) {
+ de->name_len != 1 || de->name[0] != '.') {
EXT4_ERROR_INODE(inode, "directory missing '.'");
brelse(bh);
*retval = -EFSCORRUPTED;
@@ -3559,7 +3547,8 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
de = ext4_next_entry(de, inode->i_sb->s_blocksize);
if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data,
bh->b_size, offset) ||
- le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) {
+ le32_to_cpu(de->inode) == 0 || de->name_len != 2 ||
+ de->name[0] != '.' || de->name[1] != '.') {
EXT4_ERROR_INODE(inode, "directory missing '..'");
brelse(bh);
*retval = -EFSCORRUPTED;
@@ -4199,8 +4188,9 @@ static int ext4_rename2(struct mnt_idmap *idmap,
{
int err;
- if (unlikely(ext4_forced_shutdown(old_dir->i_sb)))
- return -EIO;
+ err = ext4_emergency_state(old_dir->i_sb);
+ if (unlikely(err))
+ return err;
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
index e5b47dda3317..524d4658fa40 100644
--- a/fs/ext4/orphan.c
+++ b/fs/ext4/orphan.c
@@ -537,13 +537,13 @@ static int ext4_orphan_file_block_csum_verify(struct super_block *sb,
struct ext4_orphan_block_tail *ot;
__le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr);
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return 1;
ot = ext4_orphan_block_tail(sb, bh);
- calculated = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed,
- (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr));
- calculated = ext4_chksum(EXT4_SB(sb), calculated, (__u8 *)bh->b_data,
+ calculated = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr,
+ sizeof(dsk_block_nr));
+ calculated = ext4_chksum(calculated, (__u8 *)bh->b_data,
inodes_per_ob * sizeof(__u32));
return le32_to_cpu(ot->ob_checksum) == calculated;
}
@@ -560,10 +560,9 @@ void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers,
struct ext4_orphan_block_tail *ot;
__le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr);
- csum = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed,
- (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr));
- csum = ext4_chksum(EXT4_SB(sb), csum, (__u8 *)data,
- inodes_per_ob * sizeof(__u32));
+ csum = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr,
+ sizeof(dsk_block_nr));
+ csum = ext4_chksum(csum, (__u8 *)data, inodes_per_ob * sizeof(__u32));
ot = ext4_orphan_block_tail(sb, bh);
ot->ob_checksum = cpu_to_le32(csum);
}
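
ext4_chksum() drops its sbi argument throughout this series; checksums over discontiguous buffers still chain by feeding each result in as the next call's seed, as the two-step pattern above shows. A hedged sketch of the assumed new helper (the real definition lives in ext4.h, outside this section):

	/* Assumed shape after dropping the sbi argument. */
	static inline __u32 ext4_chksum(__u32 crc, const void *address,
					unsigned int length)
	{
		return crc32c(crc, address, length);
	}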
@@ -590,8 +589,9 @@ int ext4_init_orphan_info(struct super_block *sb)
}
oi->of_blocks = inode->i_size >> sb->s_blocksize_bits;
oi->of_csum_seed = EXT4_I(inode)->i_csum_seed;
- oi->of_binfo = kmalloc(oi->of_blocks*sizeof(struct ext4_orphan_block),
- GFP_KERNEL);
+ oi->of_binfo = kmalloc_array(oi->of_blocks,
+ sizeof(struct ext4_orphan_block),
+ GFP_KERNEL);
if (!oi->of_binfo) {
ret = -ENOMEM;
goto out_put;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 69b8a7221a2b..39abfeec5f36 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -164,7 +164,8 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
}
/*
- * Check a range of space and convert unwritten extents to written. Note that
+ * On successful IO, check a range of space and convert unwritten extents to
+ * written. On IO failure, check if journal abort is needed. Note that
* we are protected from truncate touching same part of extent tree by the
* fact that truncate code waits for all DIO to finish (thus exclusion from
* direct IO is achieved) and also waits for PageWriteback bits. Thus we
@@ -175,20 +176,36 @@ static int ext4_end_io_end(ext4_io_end_t *io_end)
{
struct inode *inode = io_end->inode;
handle_t *handle = io_end->handle;
+ struct super_block *sb = inode->i_sb;
int ret = 0;
ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p,"
"list->prev 0x%p\n",
io_end, inode->i_ino, io_end->list.next, io_end->list.prev);
- io_end->handle = NULL; /* Following call will use up the handle */
- ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
- if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) {
- ext4_msg(inode->i_sb, KERN_EMERG,
+ /*
+ * Do not convert the unwritten extents if data writeback fails,
+ * or stale data may be exposed.
+ */
+ io_end->handle = NULL; /* Following call will use up the handle */
+ if (unlikely(io_end->flag & EXT4_IO_END_FAILED)) {
+ ret = -EIO;
+ if (handle)
+ jbd2_journal_free_reserved(handle);
+
+ if (test_opt(sb, DATA_ERR_ABORT))
+ jbd2_journal_abort(EXT4_SB(sb)->s_journal, ret);
+ } else {
+ ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
+ }
+ if (ret < 0 && !ext4_emergency_state(sb) &&
+ io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ ext4_msg(sb, KERN_EMERG,
"failed to convert unwritten extents to written "
"extents -- potential data loss! "
"(inode %lu, error %d)", inode->i_ino, ret);
}
+
ext4_clear_io_unwritten_flag(io_end);
ext4_release_io_end(io_end);
return ret;
@@ -217,6 +234,18 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head)
#endif
}
+static bool ext4_io_end_defer_completion(ext4_io_end_t *io_end)
+{
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN &&
+ !list_empty(&io_end->list_vec))
+ return true;
+ if (test_opt(io_end->inode->i_sb, DATA_ERR_ABORT) &&
+ io_end->flag & EXT4_IO_END_FAILED &&
+ !ext4_emergency_state(io_end->inode->i_sb))
+ return true;
+ return false;
+}
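
ext4_io_end_defer_completion() centralizes the "must finish in process context" test used by both put paths below: either unwritten extents need conversion, or a failed IO must reach the journal-abort logic when data_err=abort is set. As a decision table:

	EXT4_IO_END_UNWRITTEN + non-empty list_vec    -> defer (convert extents)
	EXT4_IO_END_FAILED + data_err=abort, healthy  -> defer (abort journal)
	otherwise                                     -> release immediately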
+
/* Add the io_end to per-inode completed end_io list. */
static void ext4_add_complete_io(ext4_io_end_t *io_end)
{
@@ -225,9 +254,12 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
struct workqueue_struct *wq;
unsigned long flags;
- /* Only reserved conversions from writeback should enter here */
- WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
- WARN_ON(!io_end->handle && sbi->s_journal);
+ /* Only reserved conversions or pending IO errors will enter here. */
+ WARN_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION));
+ WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN &&
+ !io_end->handle && sbi->s_journal);
+ WARN_ON(!io_end->bio);
+
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
wq = sbi->rsv_conversion_wq;
if (list_empty(&ei->i_rsv_conversion_list))
@@ -252,7 +284,7 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
while (!list_empty(&unwritten)) {
io_end = list_entry(unwritten.next, ext4_io_end_t, list);
- BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+ BUG_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION));
list_del_init(&io_end->list);
err = ext4_end_io_end(io_end);
@@ -263,7 +295,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
}
/*
- * work on completed IO, to convert unwritten extents to extents
+ * Used to convert unwritten extents to written extents upon IO completion,
+ * or to abort the journal upon IO errors.
*/
void ext4_end_io_rsv_work(struct work_struct *work)
{
@@ -288,29 +321,22 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
if (refcount_dec_and_test(&io_end->count)) {
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
- list_empty(&io_end->list_vec)) {
- ext4_release_io_end(io_end);
- return;
- }
- ext4_add_complete_io(io_end);
+ if (ext4_io_end_defer_completion(io_end))
+ return ext4_add_complete_io(io_end);
+
+ ext4_release_io_end(io_end);
}
}
int ext4_put_io_end(ext4_io_end_t *io_end)
{
- int err = 0;
-
if (refcount_dec_and_test(&io_end->count)) {
- if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
- err = ext4_convert_unwritten_io_end_vec(io_end->handle,
- io_end);
- io_end->handle = NULL;
- ext4_clear_io_unwritten_flag(io_end);
- }
+ if (ext4_io_end_defer_completion(io_end))
+ return ext4_end_io_end(io_end);
+
ext4_release_io_end(io_end);
}
- return err;
+ return 0;
}
ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
@@ -344,11 +370,12 @@ static void ext4_end_bio(struct bio *bio)
bio->bi_status, inode->i_ino,
(unsigned long long)
bi_sector >> (inode->i_blkbits - 9));
+ io_end->flag |= EXT4_IO_END_FAILED;
mapping_set_error(inode->i_mapping,
blk_status_to_errno(bio->bi_status));
}
- if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ if (ext4_io_end_defer_completion(io_end)) {
/*
* Link bio into list hanging from io_end. We have to do it
* atomically as bio completions can be racing against each
@@ -520,9 +547,9 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
* first page of the bio. Otherwise it can deadlock.
*/
if (io->io_bio)
- gfp_flags = GFP_NOWAIT | __GFP_NOWARN;
+ gfp_flags = GFP_NOWAIT;
retry_encrypt:
- bounce_page = fscrypt_encrypt_pagecache_blocks(&folio->page,
+ bounce_page = fscrypt_encrypt_pagecache_blocks(folio,
enc_bytes, 0, gfp_flags);
if (IS_ERR(bounce_page)) {
ret = PTR_ERR(bounce_page);
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 5d3a9dc9a32d..f329daf6e5c7 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -227,24 +227,30 @@ int ext4_mpage_readpages(struct inode *inode,
int length;
unsigned relative_block = 0;
struct ext4_map_blocks map;
- unsigned int nr_pages = rac ? readahead_count(rac) : 1;
+ unsigned int nr_pages, folio_pages;
map.m_pblk = 0;
map.m_lblk = 0;
map.m_len = 0;
map.m_flags = 0;
- for (; nr_pages; nr_pages--) {
+ nr_pages = rac ? readahead_count(rac) : folio_nr_pages(folio);
+ for (; nr_pages; nr_pages -= folio_pages) {
int fully_mapped = 1;
- unsigned first_hole = blocks_per_page;
+ unsigned int first_hole;
+ unsigned int blocks_per_folio;
if (rac)
folio = readahead_folio(rac);
+
+ folio_pages = folio_nr_pages(folio);
prefetchw(&folio->flags);
if (folio_buffers(folio))
goto confused;
+ blocks_per_folio = folio_size(folio) >> blkbits;
+ first_hole = blocks_per_folio;
block_in_file = next_block =
(sector_t)folio->index << (PAGE_SHIFT - blkbits);
last_block = block_in_file + nr_pages * blocks_per_page;
@@ -270,7 +276,7 @@ int ext4_mpage_readpages(struct inode *inode,
map.m_flags &= ~EXT4_MAP_MAPPED;
break;
}
- if (page_block == blocks_per_page)
+ if (page_block == blocks_per_folio)
break;
page_block++;
block_in_file++;
@@ -281,7 +287,7 @@ int ext4_mpage_readpages(struct inode *inode,
* Then do more ext4_map_blocks() calls until we are
* done with this folio.
*/
- while (page_block < blocks_per_page) {
+ while (page_block < blocks_per_folio) {
if (block_in_file < last_block) {
map.m_lblk = block_in_file;
map.m_len = last_block - block_in_file;
@@ -296,13 +302,13 @@ int ext4_mpage_readpages(struct inode *inode,
}
if ((map.m_flags & EXT4_MAP_MAPPED) == 0) {
fully_mapped = 0;
- if (first_hole == blocks_per_page)
+ if (first_hole == blocks_per_folio)
first_hole = page_block;
page_block++;
block_in_file++;
continue;
}
- if (first_hole != blocks_per_page)
+ if (first_hole != blocks_per_folio)
goto confused; /* hole -> non-hole */
/* Contiguous blocks? */
@@ -315,13 +321,13 @@ int ext4_mpage_readpages(struct inode *inode,
/* needed? */
map.m_flags &= ~EXT4_MAP_MAPPED;
break;
- } else if (page_block == blocks_per_page)
+ } else if (page_block == blocks_per_folio)
break;
page_block++;
block_in_file++;
}
}
- if (first_hole != blocks_per_page) {
+ if (first_hole != blocks_per_folio) {
folio_zero_segment(folio, first_hole << blkbits,
folio_size(folio));
if (first_hole == 0) {
@@ -367,11 +373,11 @@ int ext4_mpage_readpages(struct inode *inode,
if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
(relative_block == map.m_len)) ||
- (first_hole != blocks_per_page)) {
+ (first_hole != blocks_per_folio)) {
submit_bio(bio);
bio = NULL;
} else
- last_block_in_bio = first_block + blocks_per_page - 1;
+ last_block_in_bio = first_block + blocks_per_folio - 1;
continue;
confused:
if (bio) {
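
The readpage path now computes its geometry per folio instead of assuming PAGE_SIZE units, so a single large readahead folio accounts for all of its pages via nr_pages -= folio_pages. The per-iteration setup, grounded in the hunk above:

	folio_pages = folio_nr_pages(folio);
	blocks_per_folio = folio_size(folio) >> blkbits;
	first_hole = blocks_per_folio;	/* "no hole yet" sentinel */

Every blocks_per_page comparison inside the mapping loop becomes blocks_per_folio for the same reason.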
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 72f77f78ae8d..050f26168d97 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1118,8 +1118,8 @@ static inline void ext4_set_block_group_nr(struct super_block *sb, char *data,
struct ext4_super_block *es = (struct ext4_super_block *) data;
es->s_block_group_nr = cpu_to_le16(group);
- if (ext4_has_metadata_csum(sb))
- es->s_checksum = ext4_superblock_csum(sb, es);
+ if (ext4_has_feature_metadata_csum(sb))
+ es->s_checksum = ext4_superblock_csum(es);
}
/*
@@ -1315,7 +1315,7 @@ static int ext4_set_bitmap_checksums(struct super_block *sb,
{
struct buffer_head *bh;
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return 0;
bh = ext4_get_bitmap(sb, group_data->inode_bitmap);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a50e5c31b937..699c15db28a8 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -79,7 +79,6 @@ static int ext4_unfreeze(struct super_block *sb);
static int ext4_freeze(struct super_block *sb);
static inline int ext2_feature_set_ok(struct super_block *sb);
static inline int ext3_feature_set_ok(struct super_block *sb);
-static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
static struct inode *ext4_get_journal_inode(struct super_block *sb,
@@ -269,7 +268,7 @@ struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
{
struct buffer_head *bh = bdev_getblk(sb->s_bdev, block,
- sb->s_blocksize, GFP_NOWAIT | __GFP_NOWARN);
+ sb->s_blocksize, GFP_NOWAIT);
if (likely(bh)) {
if (trylock_buffer(bh))
@@ -287,14 +286,12 @@ static int ext4_verify_csum_type(struct super_block *sb,
return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
}
-__le32 ext4_superblock_csum(struct super_block *sb,
- struct ext4_super_block *es)
+__le32 ext4_superblock_csum(struct ext4_super_block *es)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
int offset = offsetof(struct ext4_super_block, s_checksum);
__u32 csum;
- csum = ext4_chksum(sbi, ~0, (char *)es, offset);
+ csum = ext4_chksum(~0, (char *)es, offset);
return cpu_to_le32(csum);
}
@@ -302,20 +299,20 @@ __le32 ext4_superblock_csum(struct super_block *sb,
static int ext4_superblock_csum_verify(struct super_block *sb,
struct ext4_super_block *es)
{
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return 1;
- return es->s_checksum == ext4_superblock_csum(sb, es);
+ return es->s_checksum == ext4_superblock_csum(es);
}
void ext4_superblock_csum_set(struct super_block *sb)
{
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return;
- es->s_checksum = ext4_superblock_csum(sb, es);
+ es->s_checksum = ext4_superblock_csum(es);
}
ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
@@ -448,9 +445,6 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
#define ext4_get_tstamp(es, tstamp) \
__ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
-#define EXT4_SB_REFRESH_INTERVAL_SEC (3600) /* seconds (1 hour) */
-#define EXT4_SB_REFRESH_INTERVAL_KB (16384) /* kilobytes (16MB) */
-
/*
* The ext4_maybe_update_superblock() function checks and updates the
* superblock if needed.
@@ -458,8 +452,10 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
* This function is designed to update the on-disk superblock only under
* certain conditions to prevent excessive disk writes and unnecessary
* waking of the disk from sleep. The superblock will be updated if:
- * 1. More than an hour has passed since the last superblock update, and
- * 2. More than 16MB have been written since the last superblock update.
+ * 1. More than sbi->s_sb_update_sec (default: 1 hour) has passed since the
+ *    last superblock update, and
+ * 2. More than sbi->s_sb_update_kb (default: 16MB) kilobytes have been
+ *    written since the last superblock update.
*
* @sb: The superblock
*/
@@ -473,14 +469,15 @@ static void ext4_maybe_update_superblock(struct super_block *sb)
__u64 lifetime_write_kbytes;
__u64 diff_size;
- if (sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE) ||
- !journal || (journal->j_flags & JBD2_UNMOUNT))
+ if (ext4_emergency_state(sb) || sb_rdonly(sb) ||
+ !(sb->s_flags & SB_ACTIVE) || !journal ||
+ journal->j_flags & JBD2_UNMOUNT)
return;
now = ktime_get_real_seconds();
last_update = ext4_get_tstamp(es, s_wtime);
- if (likely(now - last_update < EXT4_SB_REFRESH_INTERVAL_SEC))
+ if (likely(now - last_update < sbi->s_sb_update_sec))
return;
lifetime_write_kbytes = sbi->s_kbytes_written +
@@ -495,49 +492,23 @@ static void ext4_maybe_update_superblock(struct super_block *sb)
*/
diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written);
- if (diff_size > EXT4_SB_REFRESH_INTERVAL_KB)
+ if (diff_size > sbi->s_sb_update_kb)
schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
}
static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
{
struct super_block *sb = journal->j_private;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- int error = is_journal_aborted(journal);
- struct ext4_journal_cb_entry *jce;
BUG_ON(txn->t_state == T_FINISHED);
ext4_process_freed_data(sb, txn->t_tid);
ext4_maybe_update_superblock(sb);
-
- spin_lock(&sbi->s_md_lock);
- while (!list_empty(&txn->t_private_list)) {
- jce = list_entry(txn->t_private_list.next,
- struct ext4_journal_cb_entry, jce_list);
- list_del_init(&jce->jce_list);
- spin_unlock(&sbi->s_md_lock);
- jce->jce_func(sb, jce, error);
- spin_lock(&sbi->s_md_lock);
- }
- spin_unlock(&sbi->s_md_lock);
}
-/*
- * This writepage callback for write_cache_pages()
- * takes care of a few cases after page cleaning.
- *
- * write_cache_pages() already checks for dirty pages
- * and calls clear_page_dirty_for_io(), which we want,
- * to write protect the pages.
- *
- * However, we may have to redirty a page (see below.)
- */
-static int ext4_journalled_writepage_callback(struct folio *folio,
- struct writeback_control *wbc,
- void *data)
+static bool ext4_journalled_writepage_needs_redirty(struct jbd2_inode *jinode,
+ struct folio *folio)
{
- transaction_t *transaction = (transaction_t *) data;
struct buffer_head *bh, *head;
struct journal_head *jh;
@@ -558,15 +529,12 @@ static int ext4_journalled_writepage_callback(struct folio *folio,
*/
jh = bh2jh(bh);
if (buffer_dirty(bh) ||
- (jh && (jh->b_transaction != transaction ||
- jh->b_next_transaction))) {
- folio_redirty_for_writepage(wbc, folio);
- goto out;
- }
+ (jh && (jh->b_transaction != jinode->i_transaction ||
+ jh->b_next_transaction)))
+ return true;
} while ((bh = bh->b_this_page) != head);
-out:
- return AOP_WRITEPAGE_ACTIVATE;
+ return false;
}
static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode)
@@ -578,10 +546,23 @@ static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode)
.range_start = jinode->i_dirty_start,
.range_end = jinode->i_dirty_end,
};
+ struct folio *folio = NULL;
+ int error;
- return write_cache_pages(mapping, &wbc,
- ext4_journalled_writepage_callback,
- jinode->i_transaction);
+ /*
+ * writeback_iter() already checks for dirty pages and calls
+	 * folio_clear_dirty_for_io(), which we want in order to
+	 * write-protect the folios.
+	 *
+	 * However, we may sometimes have to redirty a folio.
+ */
+ while ((folio = writeback_iter(mapping, &wbc, folio, &error))) {
+ if (ext4_journalled_writepage_needs_redirty(jinode, folio))
+ folio_redirty_for_writepage(&wbc, folio);
+ folio_unlock(folio);
+ }
+
+ return error;
}
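
Review note on the conversion: writeback_iter(), the open-coded successor to write_cache_pages(), hands back each dirty folio locked and with dirty already cleared for I/O; the caller feeds the previous folio back in, unlocks (or writes) it, and the first error is delivered through the int pointer once iteration ends. The skeleton below is schematic kernel C, not standalone-runnable, and cannot_write_now() is a placeholder predicate:

    struct folio *folio = NULL;
    int error = 0;

    while ((folio = writeback_iter(mapping, &wbc, folio, &error))) {
        /* folio is locked; dirty was cleared for I/O by the iterator */
        if (cannot_write_now(folio))
            folio_redirty_for_writepage(&wbc, folio); /* try again later */
        folio_unlock(folio); /* the iterator does not unlock for us */
    }
    /* error now holds the first failure seen during iteration, or 0 */
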
static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
@@ -707,11 +688,8 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
if (test_opt(sb, WARN_ON_ERROR))
WARN_ON_ONCE(1);
- if (!continue_fs && !sb_rdonly(sb)) {
- set_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags);
- if (journal)
- jbd2_journal_abort(journal, -EIO);
- }
+ if (!continue_fs && !ext4_emergency_ro(sb) && journal)
+ jbd2_journal_abort(journal, -EIO);
if (!bdev_read_only(sb->s_bdev)) {
save_error_info(sb, error, ino, block, func, line);
@@ -719,9 +697,13 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
* In case the fs should keep running, we need to writeout
* superblock through the journal. Due to lock ordering
* constraints, it may not be safe to do it right here so we
- * defer superblock flushing to a workqueue.
+ * defer superblock flushing to a workqueue. We just need to be
+ * careful when the journal is already shutting down. If we get
+ * here in that case, just update the sb directly as the last
+ * transaction won't commit anyway.
*/
- if (continue_fs && journal)
+ if (continue_fs && journal &&
+ !ext4_test_mount_flag(sb, EXT4_MF_JOURNAL_DESTROY))
schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
else
ext4_commit_super(sb);
@@ -737,17 +719,17 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
sb->s_id);
}
- if (sb_rdonly(sb) || continue_fs)
+ if (ext4_emergency_ro(sb) || continue_fs)
return;
ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
/*
- * EXT4_FLAGS_SHUTDOWN was set which stops all filesystem
- * modifications. We don't set SB_RDONLY because that requires
- * sb->s_umount semaphore and setting it without proper remount
- * procedure is confusing code such as freeze_super() leading to
- * deadlocks and other problems.
+ * We don't set SB_RDONLY because that requires sb->s_umount
+ * semaphore and setting it without proper remount procedure is
+ * confusing code such as freeze_super() leading to deadlocks
+ * and other problems.
*/
+ set_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags);
}
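
Review note: the predicates these hunks switch to are defined in ext4.h, which this section does not show. Judging from the call sites (ext4_sync_fs below returns the value directly, so it must be 0 or a negative errno), they presumably look roughly like the following; treat names and exact shape as inferred, not quoted:

    /* Inferred sketch; the real definitions live in fs/ext4/ext4.h. */
    static inline bool ext4_emergency_ro(struct super_block *sb)
    {
        /* Set by ext4_handle_error() instead of a full SB_RDONLY remount. */
        return test_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags);
    }

    static inline int ext4_emergency_state(struct super_block *sb)
    {
        if (unlikely(ext4_forced_shutdown(sb)))
            return -EIO;
        if (unlikely(ext4_emergency_ro(sb)))
            return -EROFS;
        return 0;
    }
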
static void update_super_work(struct work_struct *work)
@@ -765,7 +747,8 @@ static void update_super_work(struct work_struct *work)
* We use directly jbd2 functions here to avoid recursing back into
* ext4 error handling code during handling of previous errors.
*/
- if (!sb_rdonly(sbi->s_sb) && journal) {
+ if (!ext4_emergency_state(sbi->s_sb) &&
+ !sb_rdonly(sbi->s_sb) && journal) {
struct buffer_head *sbh = sbi->s_sbh;
bool call_notify_err = false;
@@ -819,7 +802,7 @@ void __ext4_error(struct super_block *sb, const char *function,
struct va_format vaf;
va_list args;
- if (unlikely(ext4_forced_shutdown(sb)))
+ if (unlikely(ext4_emergency_state(sb)))
return;
trace_ext4_error(sb, function, line);
@@ -844,7 +827,7 @@ void __ext4_error_inode(struct inode *inode, const char *function,
va_list args;
struct va_format vaf;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
+ if (unlikely(ext4_emergency_state(inode->i_sb)))
return;
trace_ext4_error(inode->i_sb, function, line);
@@ -879,7 +862,7 @@ void __ext4_error_file(struct file *file, const char *function,
struct inode *inode = file_inode(file);
char pathname[80], *path;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
+ if (unlikely(ext4_emergency_state(inode->i_sb)))
return;
trace_ext4_error(inode->i_sb, function, line);
@@ -959,7 +942,7 @@ void __ext4_std_error(struct super_block *sb, const char *function,
char nbuf[16];
const char *errstr;
- if (unlikely(ext4_forced_shutdown(sb)))
+ if (unlikely(ext4_emergency_state(sb)))
return;
/* Special case: if the error is EROFS, and we're not already
@@ -1053,7 +1036,7 @@ __acquires(bitlock)
struct va_format vaf;
va_list args;
- if (unlikely(ext4_forced_shutdown(sb)))
+ if (unlikely(ext4_emergency_state(sb)))
return;
trace_ext4_error(sb, function, line);
@@ -1306,18 +1289,17 @@ static void ext4_put_super(struct super_block *sb)
ext4_unregister_li_request(sb);
ext4_quotas_off(sb, EXT4_MAXQUOTAS);
- flush_work(&sbi->s_sb_upd_work);
destroy_workqueue(sbi->rsv_conversion_wq);
ext4_release_orphan_info(sb);
if (sbi->s_journal) {
aborted = is_journal_aborted(sbi->s_journal);
- err = jbd2_journal_destroy(sbi->s_journal);
- sbi->s_journal = NULL;
+ err = ext4_journal_destroy(sbi, sbi->s_journal);
if ((err < 0) && !aborted) {
ext4_abort(sb, -err, "Couldn't clean up the journal");
}
- }
+ } else
+ flush_work(&sbi->s_sb_upd_work);
ext4_es_unregister_shrinker(sbi);
timer_shutdown_sync(&sbi->s_err_report);
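
Review note: when a journal exists, the explicit flush_work() disappears above because ext4_journal_destroy() (added to ext4_jbd2.h per the diffstat and used throughout this patch) handles the ordering itself. Reconstructed from the removed call sites and the EXT4_MF_JOURNAL_DESTROY test in ext4_handle_error(); an inferred sketch, not a quote:

    static inline int ext4_journal_destroy(struct ext4_sb_info *sbi,
                                           journal_t *journal)
    {
        int err;

        /*
         * Flag the journal as going away so ext4_handle_error() stops
         * queueing s_sb_upd_work, then flush any instance that is
         * already queued before the journal is freed under it.
         */
        ext4_set_mount_flag(sbi->s_sb, EXT4_MF_JOURNAL_DESTROY);
        flush_work(&sbi->s_sb_upd_work);

        err = jbd2_journal_destroy(journal);
        sbi->s_journal = NULL;
        return err;
    }
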
@@ -1325,13 +1307,14 @@ static void ext4_put_super(struct super_block *sb)
ext4_mb_release(sb);
ext4_ext_release(sb);
- if (!sb_rdonly(sb) && !aborted) {
- ext4_clear_feature_journal_needs_recovery(sb);
- ext4_clear_feature_orphan_present(sb);
- es->s_state = cpu_to_le16(sbi->s_mount_state);
- }
- if (!sb_rdonly(sb))
+ if (!ext4_emergency_state(sb) && !sb_rdonly(sb)) {
+ if (!aborted) {
+ ext4_clear_feature_journal_needs_recovery(sb);
+ ext4_clear_feature_orphan_present(sb);
+ es->s_state = cpu_to_le16(sbi->s_mount_state);
+ }
ext4_commit_super(sb);
+ }
ext4_group_desc_free(sbi);
ext4_flex_groups_free(sbi);
@@ -1426,10 +1409,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
spin_lock_init(&ei->i_completed_io_lock);
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
- atomic_set(&ei->i_unwritten, 0);
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
ext4_fc_init_inode(&ei->vfs_inode);
- mutex_init(&ei->i_fc_lock);
+ spin_lock_init(&ei->i_fc_lock);
return &ei->vfs_inode;
}
@@ -1823,7 +1805,6 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
{}
};
-#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
#define MOPT_SET 0x0001
#define MOPT_CLEAR 0x0002
@@ -2017,6 +1998,9 @@ int ext4_init_fs_context(struct fs_context *fc)
fc->fs_private = ctx;
fc->ops = &ext4_context_ops;
+ /* i_version is always enabled now */
+ fc->sb_flags |= SB_I_VERSION;
+
return 0;
}
@@ -2785,6 +2769,13 @@ static int ext4_check_opt_consistency(struct fs_context *fc,
}
if (is_remount) {
+ if (!sbi->s_journal &&
+ ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT)) {
+ ext4_msg(NULL, KERN_WARNING,
+ "Remounting fs w/o journal so ignoring data_err option");
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT);
+ }
+
if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) &&
(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
ext4_msg(NULL, KERN_ERR, "can't mount with "
@@ -2987,6 +2978,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
+ if (nodefs && sb->s_flags & SB_I_VERSION)
+ SEQ_OPTS_PUTS("i_version");
if (nodefs || sbi->s_stripe)
SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
if (nodefs || EXT4_MOUNT_DATA_FLAGS &
@@ -3038,6 +3031,12 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
if (nodefs && !test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS))
SEQ_OPTS_PUTS("prefetch_block_bitmaps");
+ if (ext4_emergency_ro(sb))
+ SEQ_OPTS_PUTS("emergency_ro");
+
+ if (ext4_forced_shutdown(sb))
+ SEQ_OPTS_PUTS("shutdown");
+
ext4_show_quota_options(seq, sb);
return 0;
}
@@ -3205,19 +3204,19 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
__le32 le_group = cpu_to_le32(block_group);
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (ext4_has_metadata_csum(sbi->s_sb)) {
+ if (ext4_has_feature_metadata_csum(sbi->s_sb)) {
/* Use new metadata_csum algorithm */
__u32 csum32;
__u16 dummy_csum = 0;
- csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
+ csum32 = ext4_chksum(sbi->s_csum_seed, (__u8 *)&le_group,
sizeof(le_group));
- csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset);
- csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum,
+ csum32 = ext4_chksum(csum32, (__u8 *)gdp, offset);
+ csum32 = ext4_chksum(csum32, (__u8 *)&dummy_csum,
sizeof(dummy_csum));
offset += sizeof(dummy_csum);
if (offset < sbi->s_desc_size)
- csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset,
+ csum32 = ext4_chksum(csum32, (__u8 *)gdp + offset,
sbi->s_desc_size - offset);
crc = csum32 & 0xFFFF;
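
Here and in the xattr.c hunks below, ext4_chksum() drops its sbi argument but the chaining idiom is unchanged: checksum a structure as if its checksum field were zero, without ever writing to the field. A compilable illustration, with a toy mixer standing in for the kernel's crc32c:

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Toy stand-in for ext4_chksum()/crc32c: order-sensitive mixing only. */
    static uint32_t mix(uint32_t seed, const void *buf, size_t len)
    {
        const uint8_t *p = buf;
        while (len--)
            seed = seed * 31 + *p++;
        return seed;
    }

    struct desc {
        uint32_t a;
        uint16_t checksum; /* must be treated as zero while hashing */
        uint16_t b;
    };

    static uint16_t desc_csum(uint32_t seed, const struct desc *d)
    {
        size_t off = offsetof(struct desc, checksum);
        uint16_t zero = 0;
        uint32_t c;

        c = mix(seed, d, off);           /* bytes before the field */
        c = mix(c, &zero, sizeof(zero)); /* the field itself, as zero */
        off += sizeof(zero);
        c = mix(c, (const uint8_t *)d + off, sizeof(*d) - off); /* the rest */
        return c & 0xFFFF;
    }

    int main(void)
    {
        struct desc d = { .a = 42, .checksum = 0xdead, .b = 7 };
        printf("csum=%#x\n", desc_csum(~0u, &d));
        return 0;
    }
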
@@ -3633,7 +3632,7 @@ int ext4_feature_set_ok(struct super_block *sb, int readonly)
*/
static void print_daily_error_info(struct timer_list *t)
{
- struct ext4_sb_info *sbi = from_timer(sbi, t, s_err_report);
+ struct ext4_sb_info *sbi = timer_container_of(sbi, t, s_err_report);
struct super_block *sb = sbi->s_sb;
struct ext4_super_block *es = sbi->s_es;
@@ -3693,7 +3692,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
if (group >= elr->lr_next_group) {
ret = 1;
if (elr->lr_first_not_zeroed != ngroups &&
- !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) {
+ !ext4_emergency_state(sb) && !sb_rdonly(sb) &&
+ test_opt(sb, INIT_INODE_TABLE)) {
elr->lr_next_group = elr->lr_first_not_zeroed;
elr->lr_mode = EXT4_LI_MODE_ITABLE;
ret = 0;
@@ -3998,7 +3998,7 @@ int ext4_register_li_request(struct super_block *sb,
goto out;
}
- if (sb_rdonly(sb) ||
+ if (ext4_emergency_state(sb) || sb_rdonly(sb) ||
(test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) &&
(first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE))))
goto out;
@@ -4061,7 +4061,7 @@ static int set_journal_csum_feature_set(struct super_block *sb)
int compat, incompat;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (ext4_has_metadata_csum(sb)) {
+ if (ext4_has_feature_metadata_csum(sb)) {
/* journal checksum v3 */
compat = 0;
incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;
@@ -4349,7 +4349,7 @@ static void ext4_set_def_opts(struct super_block *sb,
if (ext4_has_feature_fast_commit(sb))
set_opt2(sb, JOURNAL_FAST_COMMIT);
/* don't forget to enable journal_csum when metadata_csum is enabled. */
- if (ext4_has_metadata_csum(sb))
+ if (ext4_has_feature_metadata_csum(sb))
set_opt(sb, JOURNAL_CHECKSUM);
if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
@@ -4441,13 +4441,16 @@ static int ext4_handle_clustersize(struct super_block *sb)
/*
* ext4_atomic_write_init: Initializes filesystem min & max atomic write units.
+ * On a non-bigalloc filesystem the atomic write unit (awu) is based on
+ * the filesystem blocksize and the bdev awu limits; with bigalloc it is
+ * based on the bigalloc cluster size and the bdev awu limits.
* @sb: super block
- * TODO: Later add support for bigalloc
*/
static void ext4_atomic_write_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct block_device *bdev = sb->s_bdev;
+ unsigned int clustersize = EXT4_CLUSTER_SIZE(sb);
if (!bdev_can_atomic_write(bdev))
return;
@@ -4457,7 +4460,7 @@ static void ext4_atomic_write_init(struct super_block *sb)
sbi->s_awu_min = max(sb->s_blocksize,
bdev_atomic_write_unit_min_bytes(bdev));
- sbi->s_awu_max = min(sb->s_blocksize,
+ sbi->s_awu_max = min(clustersize,
bdev_atomic_write_unit_max_bytes(bdev));
if (sbi->s_awu_min && sbi->s_awu_max &&
sbi->s_awu_min <= sbi->s_awu_max) {
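
So with bigalloc the upper bound grows from one block to one cluster while the lower bound stays at the block size. The same arithmetic as standalone C; the bdev limits are made-up example values:

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))
    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    int main(void)
    {
        unsigned int blocksize = 4096;
        unsigned int clustersize = 16 * 4096; /* bigalloc: 16 blocks */
        unsigned int bdev_awu_min = 512, bdev_awu_max = 65536;

        unsigned int awu_min = MAX(blocksize, bdev_awu_min);
        unsigned int awu_max = MIN(clustersize, bdev_awu_max);

        if (awu_min && awu_max && awu_min <= awu_max)
            printf("atomic writes: %u..%u bytes\n", awu_min, awu_max);
        else
            printf("atomic writes unsupported\n");
        return 0;
    }
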
@@ -4482,7 +4485,7 @@ static void ext4_fast_commit_init(struct super_block *sb)
sbi->s_fc_bytes = 0;
ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
sbi->s_fc_ineligible_tid = 0;
- spin_lock_init(&sbi->s_fc_lock);
+ mutex_init(&sbi->s_fc_lock);
memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
sbi->s_fc_replay_state.fc_regions = NULL;
sbi->s_fc_replay_state.fc_regions_size = 0;
@@ -4642,8 +4645,9 @@ static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_blo
/* Precompute checksum seed for all metadata */
if (ext4_has_feature_csum_seed(sb))
sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
- else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
- sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
+ else if (ext4_has_feature_metadata_csum(sb) ||
+ ext4_has_feature_ea_inode(sb))
+ sbi->s_csum_seed = ext4_chksum(~0, es->s_uuid,
sizeof(es->s_uuid));
return 0;
}
@@ -4973,10 +4977,7 @@ static int ext4_load_and_init_journal(struct super_block *sb,
return 0;
out:
- /* flush s_sb_upd_work before destroying the journal. */
- flush_work(&sbi->s_sb_upd_work);
- jbd2_journal_destroy(sbi->s_journal);
- sbi->s_journal = NULL;
+ ext4_journal_destroy(sbi, sbi->s_journal);
return -EINVAL;
}
@@ -5013,6 +5014,24 @@ static int ext4_check_journal_data_mode(struct super_block *sb)
return 0;
}
+static const char *ext4_has_journal_option(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
+ return "journal_async_commit";
+ if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM))
+ return "journal_checksum";
+	if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE * HZ)
+ return "commit=";
+ if (EXT4_MOUNT_DATA_FLAGS &
+ (sbi->s_mount_opt ^ sbi->s_def_mount_opt))
+ return "data=";
+ if (test_opt(sb, DATA_ERR_ABORT))
+ return "data_err=abort";
+ return NULL;
+}
+
static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb,
int silent)
{
@@ -5239,7 +5258,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
/* Set defaults for the variables that will be set during parsing */
if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO))
- ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+ ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
sbi->s_sectors_written_start =
@@ -5263,6 +5282,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
+ sbi->s_sb_update_kb = EXT4_DEF_SB_UPDATE_INTERVAL_KB;
+ sbi->s_sb_update_sec = EXT4_DEF_SB_UPDATE_INTERVAL_SEC;
/*
* set default s_li_wait_mult for lazyinit, for the case there is
@@ -5298,9 +5319,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
- /* i_version is always enabled now */
- sb->s_flags |= SB_I_VERSION;
-
/* HSM events are allowed by default. */
sb->s_iflags |= SB_I_ALLOW_HSM;
@@ -5398,36 +5416,25 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
err = ext4_load_and_init_journal(sb, es, ctx);
if (err)
goto failed_mount3a;
+ if (bdev_read_only(sb->s_bdev))
+ needs_recovery = 0;
} else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) &&
ext4_has_feature_journal_needs_recovery(sb)) {
ext4_msg(sb, KERN_ERR, "required journal recovery "
"suppressed and not mounted read-only");
goto failed_mount3a;
} else {
+ const char *journal_option;
+
/* Nojournal mode, all journal mount options are illegal */
- if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "journal_async_commit, fs mounted w/o journal");
+ journal_option = ext4_has_journal_option(sb);
+ if (journal_option != NULL) {
+ ext4_msg(sb, KERN_ERR,
+ "can't mount with %s, fs mounted w/o journal",
+ journal_option);
goto failed_mount3a;
}
- if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "journal_checksum, fs mounted w/o journal");
- goto failed_mount3a;
- }
- if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "commit=%lu, fs mounted w/o journal",
- sbi->s_commit_interval / HZ);
- goto failed_mount3a;
- }
- if (EXT4_MOUNT_DATA_FLAGS &
- (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "data=, fs mounted w/o journal");
- goto failed_mount3a;
- }
sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
clear_opt(sb, JOURNAL_CHECKSUM);
clear_opt(sb, DATA_FLAGS);
@@ -5616,9 +5623,11 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
goto failed_mount9;
}
- if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev))
+ if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) {
ext4_msg(sb, KERN_WARNING,
"mounting with \"discard\" option, but the device does not support discard");
+ clear_opt(sb, DISCARD);
+ }
if (es->s_error_count)
mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
@@ -5665,10 +5674,7 @@ failed_mount_wq:
sbi->s_ea_block_cache = NULL;
if (sbi->s_journal) {
- /* flush s_sb_upd_work before journal destroy. */
- flush_work(&sbi->s_sb_upd_work);
- jbd2_journal_destroy(sbi->s_journal);
- sbi->s_journal = NULL;
+ ext4_journal_destroy(sbi, sbi->s_journal);
}
failed_mount3a:
ext4_es_unregister_shrinker(sbi);
@@ -5676,7 +5682,7 @@ failed_mount3:
/* flush s_sb_upd_work before sbi destroy */
flush_work(&sbi->s_sb_upd_work);
ext4_stop_mmpd(sbi);
- del_timer_sync(&sbi->s_err_report);
+ timer_delete_sync(&sbi->s_err_report);
ext4_group_desc_free(sbi);
failed_mount:
#if IS_ENABLED(CONFIG_UNICODE)
@@ -5773,10 +5779,6 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
journal->j_flags |= JBD2_BARRIER;
else
journal->j_flags &= ~JBD2_BARRIER;
- if (test_opt(sb, DATA_ERR_ABORT))
- journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
- else
- journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
/*
* Always enable journal cycle record option, letting the journal
* records log transactions continuously between each mount.
@@ -5916,7 +5918,7 @@ static struct file *ext4_get_journal_blkdev(struct super_block *sb,
if ((le32_to_cpu(es->s_feature_ro_compat) &
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
- es->s_checksum != ext4_superblock_csum(sb, es)) {
+ es->s_checksum != ext4_superblock_csum(es)) {
ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock");
errno = -EFSCORRUPTED;
goto out_bh;
@@ -5973,7 +5975,7 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb,
return journal;
out_journal:
- jbd2_journal_destroy(journal);
+ ext4_journal_destroy(EXT4_SB(sb), journal);
out_bdev:
bdev_fput(bdev_file);
return ERR_PTR(errno);
@@ -6090,8 +6092,7 @@ static int ext4_load_journal(struct super_block *sb,
EXT4_SB(sb)->s_journal = journal;
err = ext4_clear_journal_err(sb, es);
if (err) {
- EXT4_SB(sb)->s_journal = NULL;
- jbd2_journal_destroy(journal);
+ ext4_journal_destroy(EXT4_SB(sb), journal);
return err;
}
@@ -6109,7 +6110,7 @@ static int ext4_load_journal(struct super_block *sb,
return 0;
err_out:
- jbd2_journal_destroy(journal);
+ ext4_journal_destroy(EXT4_SB(sb), journal);
return err;
}
@@ -6336,8 +6337,9 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
bool needs_barrier = false;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (unlikely(ext4_forced_shutdown(sb)))
- return -EIO;
+ ret = ext4_emergency_state(sb);
+ if (unlikely(ret))
+ return ret;
trace_ext4_sync_fs(sb, wait);
flush_workqueue(sbi->rsv_conversion_wq);
@@ -6419,7 +6421,7 @@ out:
*/
static int ext4_unfreeze(struct super_block *sb)
{
- if (ext4_forced_shutdown(sb))
+ if (ext4_emergency_state(sb))
return 0;
if (EXT4_SB(sb)->s_journal) {
@@ -6495,7 +6497,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
ctx->journal_ioprio =
sbi->s_journal->j_task->io_context->ioprio;
else
- ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+ ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
}
@@ -6575,7 +6577,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
flush_work(&sbi->s_sb_upd_work);
if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) {
- if (ext4_forced_shutdown(sb)) {
+ if (ext4_emergency_state(sb)) {
err = -EROFS;
goto restore_opts;
}
@@ -6780,6 +6782,7 @@ static int ext4_reconfigure(struct fs_context *fc)
{
struct super_block *sb = fc->root->d_sb;
int ret;
+ bool old_ro = sb_rdonly(sb);
fc->s_fs_info = EXT4_SB(sb);
@@ -6791,9 +6794,9 @@ static int ext4_reconfigure(struct fs_context *fc)
if (ret < 0)
return ret;
- ext4_msg(sb, KERN_INFO, "re-mounted %pU %s. Quota mode: %s.",
- &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w",
- ext4_quota_mode(sb));
+ ext4_msg(sb, KERN_INFO, "re-mounted %pU%s.",
+ &sb->s_uuid,
+ (old_ro != sb_rdonly(sb)) ? (sb_rdonly(sb) ? " ro" : " r/w") : "");
return 0;
}
@@ -6817,22 +6820,29 @@ static int ext4_statfs_project(struct super_block *sb,
dquot->dq_dqb.dqb_bhardlimit);
limit >>= sb->s_blocksize_bits;
- if (limit && buf->f_blocks > limit) {
+ if (limit) {
+ uint64_t remaining = 0;
+
curblock = (dquot->dq_dqb.dqb_curspace +
dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
- buf->f_blocks = limit;
- buf->f_bfree = buf->f_bavail =
- (buf->f_blocks > curblock) ?
- (buf->f_blocks - curblock) : 0;
+ if (limit > curblock)
+ remaining = limit - curblock;
+
+ buf->f_blocks = min(buf->f_blocks, limit);
+ buf->f_bfree = min(buf->f_bfree, remaining);
+ buf->f_bavail = min(buf->f_bavail, remaining);
}
limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit,
dquot->dq_dqb.dqb_ihardlimit);
- if (limit && buf->f_files > limit) {
- buf->f_files = limit;
- buf->f_ffree =
- (buf->f_files > dquot->dq_dqb.dqb_curinodes) ?
- (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0;
+ if (limit) {
+ uint64_t remaining = 0;
+
+ if (limit > dquot->dq_dqb.dqb_curinodes)
+ remaining = limit - dquot->dq_dqb.dqb_curinodes;
+
+ buf->f_files = min(buf->f_files, limit);
+ buf->f_ffree = min(buf->f_ffree, remaining);
}
spin_unlock(&dquot->dq_dqb_lock);
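
The rewritten ext4_statfs_project() clamps instead of overwriting: the reported totals can never exceed either the filesystem-wide numbers or the quota-derived ones, and the quota remainder is computed without underflow. Reduced to standalone C with illustrative names:

    #include <stdint.h>
    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    static void clamp_to_quota(uint64_t *total, uint64_t *avail,
                               uint64_t limit, uint64_t used)
    {
        uint64_t remaining = 0;

        if (!limit)
            return;                 /* no quota limit configured */
        if (limit > used)
            remaining = limit - used; /* cannot underflow */

        *total = MIN(*total, limit);
        *avail = MIN(*avail, remaining);
    }

    int main(void)
    {
        uint64_t f_blocks = 1000, f_bavail = 400;

        /* quota: limit 300 blocks, 350 already used -> 0 available */
        clamp_to_quota(&f_blocks, &f_bavail, 300, 350);
        printf("blocks=%llu avail=%llu\n",
               (unsigned long long)f_blocks, (unsigned long long)f_bavail);
        return 0;
    }
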
@@ -6935,12 +6945,25 @@ static int ext4_release_dquot(struct dquot *dquot)
{
int ret, err;
handle_t *handle;
+ bool freeze_protected = false;
+
+ /*
+ * Trying to sb_start_intwrite() in a running transaction
+ * can result in a deadlock. Further, running transactions
+ * are already protected from freezing.
+ */
+ if (!ext4_journal_current_handle()) {
+ sb_start_intwrite(dquot->dq_sb);
+ freeze_protected = true;
+ }
handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
if (IS_ERR(handle)) {
/* Release dquot anyway to avoid endless cycle in dqput() */
dquot_release(dquot);
+ if (freeze_protected)
+ sb_end_intwrite(dquot->dq_sb);
return PTR_ERR(handle);
}
ret = dquot_release(dquot);
@@ -6951,6 +6974,10 @@ static int ext4_release_dquot(struct dquot *dquot)
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
+
+ if (freeze_protected)
+ sb_end_intwrite(dquot->dq_sb);
+
return ret;
}
@@ -7288,7 +7315,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
}
lock_buffer(bh);
memcpy(bh->b_data+offset, data, len);
- flush_dcache_page(bh->b_page);
+ flush_dcache_folio(bh->b_folio);
unlock_buffer(bh);
err = ext4_handle_dirty_metadata(handle, NULL, bh);
brelse(bh);
@@ -7381,12 +7408,9 @@ static struct file_system_type ext4_fs_type = {
};
MODULE_ALIAS_FS("ext4");
-/* Shared across all ext4 file systems */
-wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
-
static int __init ext4_init_fs(void)
{
- int i, err;
+ int err;
ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64);
ext4_li_info = NULL;
@@ -7394,9 +7418,6 @@ static int __init ext4_init_fs(void)
/* Build-time check for flags consistency */
ext4_check_flag_values();
- for (i = 0; i < EXT4_WQ_HASH_SZ; i++)
- init_waitqueue_head(&ext4__ioend_wq[i]);
-
err = ext4_init_es();
if (err)
return err;
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index ddb54608ca2e..987bd00f916a 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -254,6 +254,8 @@ EXT4_ATTR(journal_task, 0444, journal_task);
EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
EXT4_RW_ATTR_SBI_UL(last_trim_minblks, s_last_trim_minblks);
+EXT4_RW_ATTR_SBI_UI(sb_update_sec, s_sb_update_sec);
+EXT4_RW_ATTR_SBI_UI(sb_update_kb, s_sb_update_kb);
static unsigned int old_bump_val = 128;
EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -305,6 +307,8 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(mb_prefetch),
ATTR_LIST(mb_prefetch_limit),
ATTR_LIST(last_trim_minblks),
+ ATTR_LIST(sb_update_sec),
+ ATTR_LIST(sb_update_kb),
NULL,
};
ATTRIBUTE_GROUPS(ext4);
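
These two attributes surface s_sb_update_sec and s_sb_update_kb from the super.c hunks above as per-filesystem sysfs files. A minimal userspace sketch, assuming the usual /sys/fs/ext4/<disk>/ layout for EXT4_RW_ATTR_SBI_UI attributes; "sda1" is a placeholder device name:

    #include <stdio.h>

    /* Example: require 10 minutes and 64 MB of writes before refreshing
     * the on-disk superblock copy. */
    int main(void)
    {
        FILE *f = fopen("/sys/fs/ext4/sda1/sb_update_sec", "w");
        if (!f)
            return 1;
        fprintf(f, "600\n");
        fclose(f);

        f = fopen("/sys/fs/ext4/sda1/sb_update_kb", "w");
        if (!f)
            return 1;
        fprintf(f, "65536\n");
        fclose(f);
        return 0;
    }
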
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 7647e9f6e190..5a6fe1513fd2 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -139,12 +139,12 @@ static __le32 ext4_xattr_block_csum(struct inode *inode,
__u32 dummy_csum = 0;
int offset = offsetof(struct ext4_xattr_header, h_checksum);
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
sizeof(dsk_block_nr));
- csum = ext4_chksum(sbi, csum, (__u8 *)hdr, offset);
- csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
+ csum = ext4_chksum(csum, (__u8 *)hdr, offset);
+ csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
offset += sizeof(dummy_csum);
- csum = ext4_chksum(sbi, csum, (__u8 *)hdr + offset,
+ csum = ext4_chksum(csum, (__u8 *)hdr + offset,
EXT4_BLOCK_SIZE(inode->i_sb) - offset);
return cpu_to_le32(csum);
@@ -156,7 +156,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode,
struct ext4_xattr_header *hdr = BHDR(bh);
int ret = 1;
- if (ext4_has_metadata_csum(inode->i_sb)) {
+ if (ext4_has_feature_metadata_csum(inode->i_sb)) {
lock_buffer(bh);
ret = (hdr->h_checksum == ext4_xattr_block_csum(inode,
bh->b_blocknr, hdr));
@@ -168,7 +168,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode,
static void ext4_xattr_block_csum_set(struct inode *inode,
struct buffer_head *bh)
{
- if (ext4_has_metadata_csum(inode->i_sb))
+ if (ext4_has_feature_metadata_csum(inode->i_sb))
BHDR(bh)->h_checksum = ext4_xattr_block_csum(inode,
bh->b_blocknr, BHDR(bh));
}
@@ -308,7 +308,7 @@ __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh,
__ext4_xattr_check_block((inode), (bh), __func__, __LINE__)
-static inline int
+int
__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
void *end, const char *function, unsigned int line)
{
@@ -316,9 +316,6 @@ __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
function, line);
}
-#define xattr_check_inode(inode, header, end) \
- __xattr_check_inode((inode), (header), (end), __func__, __LINE__)
-
static int
xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry,
void *end, int name_index, const char *name, int sorted)
@@ -341,7 +338,7 @@ xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry,
cmp = name_len - entry->e_name_len;
if (!cmp)
cmp = memcmp(name, entry->e_name, name_len);
- if (cmp <= 0 && (sorted || cmp == 0))
+ if (!cmp || (cmp < 0 && sorted))
break;
}
*pentry = entry;
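
The new break condition in xattr_find_entry() is behavior-preserving, just flatter to read: stop on an exact match, or, when the list is sorted, at the first entry that sorts after the key. A brute-force check of the equivalence in standalone C:

    #include <stdio.h>
    #include <stdbool.h>

    int main(void)
    {
        /* cmp < 0, == 0, > 0; sorted false/true: all six cases */
        for (int cmp = -1; cmp <= 1; cmp++) {
            for (int sorted = 0; sorted <= 1; sorted++) {
                bool old_cond = cmp <= 0 && (sorted || cmp == 0);
                bool new_cond = !cmp || (cmp < 0 && sorted);
                printf("cmp=%+d sorted=%d old=%d new=%d%s\n",
                       cmp, sorted, old_cond, new_cond,
                       old_cond == new_cond ? "" : "  MISMATCH");
            }
        }
        return 0;
    }
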
@@ -351,7 +348,7 @@ xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry,
static u32
ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
{
- return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size);
+ return ext4_chksum(sbi->s_csum_seed, buffer, size);
}
static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode)
@@ -649,10 +646,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
return error;
raw_inode = ext4_raw_inode(&iloc);
header = IHDR(inode, raw_inode);
- end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
- error = xattr_check_inode(inode, header, end);
- if (error)
- goto cleanup;
+ end = ITAIL(inode, raw_inode);
entry = IFIRST(header);
error = xattr_find_entry(inode, &entry, end, name_index, name, 0);
if (error)
@@ -783,7 +777,6 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
struct ext4_xattr_ibody_header *header;
struct ext4_inode *raw_inode;
struct ext4_iloc iloc;
- void *end;
int error;
if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
@@ -793,14 +786,9 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
return error;
raw_inode = ext4_raw_inode(&iloc);
header = IHDR(inode, raw_inode);
- end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
- error = xattr_check_inode(inode, header, end);
- if (error)
- goto cleanup;
error = ext4_xattr_list_entries(dentry, IFIRST(header),
buffer, buffer_size);
-cleanup:
brelse(iloc.bh);
return error;
}
@@ -868,7 +856,6 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
struct ext4_xattr_ibody_header *header;
struct ext4_xattr_entry *entry;
qsize_t ea_inode_refs = 0;
- void *end;
int ret;
lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem);
@@ -879,10 +866,6 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
goto out;
raw_inode = ext4_raw_inode(&iloc);
header = IHDR(inode, raw_inode);
- end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
- ret = xattr_check_inode(inode, header, end);
- if (ret)
- goto out;
for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
entry = EXT4_XATTR_NEXT(entry))
@@ -979,7 +962,7 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
* so we need to reserve credits for this eventuality
*/
if (inode && ext4_has_inline_data(inode))
- credits += ext4_writepage_trans_blocks(inode) + 1;
+ credits += ext4_chunk_trans_extent(inode, 1) + 1;
/* We are done if ea_inode feature is not enabled. */
if (!ext4_has_feature_ea_inode(sb))
@@ -1176,15 +1159,24 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
{
struct inode *ea_inode;
struct ext4_xattr_entry *entry;
+ struct ext4_iloc iloc;
bool dirty = false;
unsigned int ea_ino;
int err;
int credits;
+ void *end;
+
+	if (block_csum) {
+		end = (void *)bh->b_data + bh->b_size;
+	} else {
+ ext4_get_inode_loc(parent, &iloc);
+ end = (void *)ext4_raw_inode(&iloc) + EXT4_SB(parent->i_sb)->s_inode_size;
+ }
/* One credit for dec ref on ea_inode, one for orphan list addition, */
credits = 2 + extra_credits;
- for (entry = first; !IS_LAST_ENTRY(entry);
+ for (entry = first; (void *)entry < end && !IS_LAST_ENTRY(entry);
entry = EXT4_XATTR_NEXT(entry)) {
if (!entry->e_value_inum)
continue;
@@ -2235,11 +2227,8 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
header = IHDR(inode, raw_inode);
is->s.base = is->s.first = IFIRST(header);
is->s.here = is->s.first;
- is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+ is->s.end = ITAIL(inode, raw_inode);
if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
- error = xattr_check_inode(inode, header, is->s.end);
- if (error)
- return error;
/* Find the named attribute. */
error = xattr_find_entry(inode, &is->s.here, is->s.end,
i->name_index, i->name, 0);
@@ -2786,14 +2775,10 @@ retry:
*/
base = IFIRST(header);
- end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+ end = ITAIL(inode, raw_inode);
min_offs = end - base;
total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32);
- error = xattr_check_inode(inode, header, end);
- if (error)
- goto cleanup;
-
ifree = ext4_xattr_free_space(base, &min_offs, base, &total_ino);
if (ifree >= isize_diff)
goto shift;
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index b25c2d7b5f99..1fedf44d4fb6 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -67,6 +67,9 @@ struct ext4_xattr_entry {
((void *)raw_inode + \
EXT4_GOOD_OLD_INODE_SIZE + \
EXT4_I(inode)->i_extra_isize))
+#define ITAIL(inode, raw_inode) \
+ ((void *)(raw_inode) + \
+ EXT4_SB((inode)->i_sb)->s_inode_size)
#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
/*
@@ -206,6 +209,13 @@ extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
extern struct mb_cache *ext4_xattr_create_cache(void);
extern void ext4_xattr_destroy_cache(struct mb_cache *);
+extern int
+__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
+ void *end, const char *function, unsigned int line);
+
+#define xattr_check_inode(inode, header, end) \
+ __xattr_check_inode((inode), (header), (end), __func__, __LINE__)
+
#ifdef CONFIG_EXT4_FS_SECURITY
extern int ext4_init_security(handle_t *handle, struct inode *inode,
struct inode *dir, const struct qstr *qstr);
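
Closing note on the xattr side: the new ITAIL() macro gives in-inode traversals an explicit end pointer, and ext4_xattr_inode_dec_ref_all() now bounds its walk with one ((void *)entry < end), so a corrupted e_name_len chain can no longer run past the region; the patch drops several up-front xattr_check_inode() calls apparently in favor of such bounded walks, while exporting the checker through xattr.h for callers elsewhere. A toy model of a bounded variable-length-entry walk in standalone C:

    #include <stdint.h>
    #include <stdio.h>

    /* Toy variable-length entry, modelled on struct ext4_xattr_entry. */
    struct entry {
        uint8_t name_len; /* 0 terminates the list, like IS_LAST_ENTRY */
        char name[];
    };

    #define NEXT(e) ((struct entry *)((char *)(e) + \
                     sizeof(struct entry) + (e)->name_len))

    static void walk(const void *base, const void *end)
    {
        const struct entry *e = base;

        /* Bounds check first: a bogus name_len cannot walk past 'end'. */
        while ((const void *)e < end && e->name_len) {
            printf("entry: %.*s\n", e->name_len, e->name);
            e = NEXT(e);
        }
    }

    int main(void)
    {
        /* Two entries, then a zero terminator. */
        uint8_t buf[] = { 3, 'f', 'o', 'o', 3, 'b', 'a', 'r', 0 };
        walk(buf, buf + sizeof(buf));
        return 0;
    }
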