diff options
Diffstat (limited to 'fs/ext4')
36 files changed, 3506 insertions, 2686 deletions
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index ef4c19e5f570..0c5a79c3b5d4 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -68,11 +68,6 @@ extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); static inline int ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) { - /* usually, the umask is applied by posix_acl_create(), but if - ext4 ACL support is disabled at compile time, we need to do - it here, because posix_acl_create() will never be called */ - inode->i_mode &= ~current_umask(); - return 0; } #endif /* CONFIG_EXT4_FS_POSIX_ACL */ diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 396474e9e2bf..c48fd36b2d74 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -22,6 +22,7 @@ #include "mballoc.h" #include <trace/events/ext4.h> +#include <kunit/static_stub.h> static unsigned ext4_num_base_meta_clusters(struct super_block *sb, ext4_group_t block_group); @@ -111,10 +112,8 @@ static unsigned ext4_num_overhead_clusters(struct super_block *sb, itbl_blk_start = ext4_inode_table(sb, gdp); itbl_blk_end = itbl_blk_start + sbi->s_itb_per_group - 1; if (itbl_blk_start <= end && itbl_blk_end >= start) { - itbl_blk_start = itbl_blk_start >= start ? - itbl_blk_start : start; - itbl_blk_end = itbl_blk_end <= end ? - itbl_blk_end : end; + itbl_blk_start = max(itbl_blk_start, start); + itbl_blk_end = min(itbl_blk_end, end); itbl_cluster_start = EXT4_B2C(sbi, itbl_blk_start - start); itbl_cluster_end = EXT4_B2C(sbi, itbl_blk_end - start); @@ -274,6 +273,9 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh_p; + KUNIT_STATIC_STUB_REDIRECT(ext4_get_group_desc, + sb, block_group, bh); + if (block_group >= ngroups) { ext4_error(sb, "block_group >= groups_count - block_group = %u," " groups_count = %u", block_group, ngroups); @@ -468,6 +470,9 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, ext4_fsblk_t bitmap_blk; int err; + KUNIT_STATIC_STUB_REDIRECT(ext4_read_block_bitmap_nowait, + sb, block_group, ignore_locked); + desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) return ERR_PTR(-EFSCORRUPTED); @@ -564,6 +569,9 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, { struct ext4_group_desc *desc; + KUNIT_STATIC_STUB_REDIRECT(ext4_wait_block_bitmap, + sb, block_group, bh); + if (!buffer_new(bh)) return 0; desc = ext4_get_group_desc(sb, block_group, NULL); @@ -641,8 +649,8 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi, /* Hm, nope. Are (enough) root reserved clusters available? */ if (uid_eq(sbi->s_resuid, current_fsuid()) || (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || - capable(CAP_SYS_RESOURCE) || - (flags & EXT4_MB_USE_ROOT_BLOCKS)) { + (flags & EXT4_MB_USE_ROOT_BLOCKS) || + capable(CAP_SYS_RESOURCE)) { if (free_clusters >= (nclusters + dirty_clusters + resv_clusters)) diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index cd725bebe69e..2a135075468d 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -18,15 +18,17 @@ unsigned int ext4_count_free(char *bitmap, unsigned int numchars) int ext4_inode_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, - struct buffer_head *bh, int sz) + struct buffer_head *bh) { __u32 hi; __u32 provided, calculated; struct ext4_sb_info *sbi = EXT4_SB(sb); + int sz; if (!ext4_has_metadata_csum(sb)) return 1; + sz = EXT4_INODES_PER_GROUP(sb) >> 3; provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) { @@ -40,14 +42,16 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb, void ext4_inode_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, - struct buffer_head *bh, int sz) + struct buffer_head *bh) { __u32 csum; struct ext4_sb_info *sbi = EXT4_SB(sb); + int sz; if (!ext4_has_metadata_csum(sb)) return; + sz = EXT4_INODES_PER_GROUP(sb) >> 3; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 6fe3c941b565..e8c5525afc67 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -72,7 +72,7 @@ static int add_system_zone(struct ext4_system_blocks *system_blks, { struct ext4_system_zone *new_entry, *entry; struct rb_node **n = &system_blks->root.rb_node, *node; - struct rb_node *parent = NULL, *new_node = NULL; + struct rb_node *parent = NULL, *new_node; while (*n) { parent = *n; @@ -351,10 +351,9 @@ int ext4_check_blockref(const char *function, unsigned int line, { __le32 *bref = p; unsigned int blk; + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - if (ext4_has_feature_journal(inode->i_sb) && - (inode->i_ino == - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) + if (journal && inode == journal->j_inode) return 0; while (bref < p+max) { diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 453d4da5de52..0a056d97e640 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -31,11 +31,10 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, ext4_fname_from_fscrypt_name(fname, &name); -#if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, iname, fname); if (err) ext4_fname_free_filename(fname); -#endif + return err; } @@ -51,11 +50,9 @@ int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, ext4_fname_from_fscrypt_name(fname, &name); -#if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname); if (err) ext4_fname_free_filename(fname); -#endif return err; } @@ -70,10 +67,7 @@ void ext4_fname_free_filename(struct ext4_filename *fname) fname->usr_fname = NULL; fname->disk_name.name = NULL; -#if IS_ENABLED(CONFIG_UNICODE) - kfree(fname->cf_name.name); - fname->cf_name.name = NULL; -#endif + ext4_fname_free_ci_filename(fname); } static bool uuid_is_zero(__u8 u[16]) @@ -232,19 +226,14 @@ static bool ext4_has_stable_inodes(struct super_block *sb) return ext4_has_feature_stable_inodes(sb); } -static void ext4_get_ino_and_lblk_bits(struct super_block *sb, - int *ino_bits_ret, int *lblk_bits_ret) -{ - *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count); - *lblk_bits_ret = 8 * sizeof(ext4_lblk_t); -} - const struct fscrypt_operations ext4_cryptops = { - .key_prefix = "ext4:", + .needs_bounce_pages = 1, + .has_32bit_inodes = 1, + .supports_subblock_data_units = 1, + .legacy_key_prefix = "ext4:", .get_context = ext4_get_context, .set_context = ext4_set_context, .get_dummy_policy = ext4_get_dummy_policy, .empty_dir = ext4_empty_dir, .has_stable_inodes = ext4_has_stable_inodes, - .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits, }; diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 7ea33c3fe94e..b278b5703c19 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -104,6 +104,9 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, else if (unlikely(le32_to_cpu(de->inode) > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; + else if (unlikely(next_offset == size && de->name_len == 1 && + de->name[0] == '.')) + error_msg = "'.' directory cannot be the last in data block"; else return 0; @@ -133,6 +136,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) struct super_block *sb = inode->i_sb; struct buffer_head *bh = NULL; struct fscrypt_str fstr = FSTR_INIT(NULL, 0); + struct dir_private_info *info = file->private_data; err = fscrypt_prepare_readdir(inode); if (err) @@ -192,7 +196,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) (PAGE_SHIFT - inode->i_blkbits); if (!ra_has_index(&file->f_ra, index)) page_cache_sync_readahead( - sb->s_bdev->bd_inode->i_mapping, + sb->s_bdev->bd_mapping, &file->f_ra, file, index, 1); file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT; @@ -229,7 +233,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ - if (!inode_eq_iversion(inode, file->f_version)) { + if (!inode_eq_iversion(inode, info->cookie)) { for (i = 0; i < sb->s_blocksize && i < offset; ) { de = (struct ext4_dir_entry_2 *) (bh->b_data + i); @@ -249,7 +253,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) offset = i; ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) | offset; - file->f_version = inode_query_iversion(inode); + info->cookie = inode_query_iversion(inode); } while (ctx->pos < inode->i_size @@ -392,6 +396,7 @@ static inline loff_t ext4_get_htree_eof(struct file *filp) static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; + struct dir_private_info *info = file->private_data; int dx_dir = is_dx_dir(inode); loff_t ret, htree_max = ext4_get_htree_eof(file); @@ -400,7 +405,7 @@ static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) htree_max, htree_max); else ret = ext4_llseek(file, offset, whence); - file->f_version = inode_peek_iversion(inode) - 1; + info->cookie = inode_peek_iversion(inode) - 1; return ret; } @@ -437,18 +442,15 @@ static void free_rb_tree_fname(struct rb_root *root) *root = RB_ROOT; } - -static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp, - loff_t pos) +static void ext4_htree_init_dir_info(struct file *filp, loff_t pos) { - struct dir_private_info *p; - - p = kzalloc(sizeof(*p), GFP_KERNEL); - if (!p) - return NULL; - p->curr_hash = pos2maj_hash(filp, pos); - p->curr_minor_hash = pos2min_hash(filp, pos); - return p; + struct dir_private_info *p = filp->private_data; + + if (is_dx_dir(file_inode(filp)) && !p->initialized) { + p->curr_hash = pos2maj_hash(filp, pos); + p->curr_minor_hash = pos2min_hash(filp, pos); + p->initialized = true; + } } void ext4_htree_free_dir_info(struct dir_private_info *p) @@ -560,12 +562,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) struct fname *fname; int ret = 0; - if (!info) { - info = ext4_htree_create_dir_info(file, ctx->pos); - if (!info) - return -ENOMEM; - file->private_data = info; - } + ext4_htree_init_dir_info(file, ctx->pos); if (ctx->pos == ext4_get_htree_eof(file)) return 0; /* EOF */ @@ -598,10 +595,10 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) * cached entries. */ if ((!info->curr_node) || - !inode_eq_iversion(inode, file->f_version)) { + !inode_eq_iversion(inode, info->cookie)) { info->curr_node = NULL; free_rb_tree_fname(&info->root); - file->f_version = inode_query_iversion(inode); + info->cookie = inode_query_iversion(inode); ret = ext4_htree_fill_tree(file, info->curr_hash, info->curr_minor_hash, &info->next_hash); @@ -672,7 +669,19 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, return 0; } +static int ext4_dir_open(struct inode *inode, struct file *file) +{ + struct dir_private_info *info; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + file->private_data = info; + return 0; +} + const struct file_operations ext4_dir_operations = { + .open = ext4_dir_open, .llseek = ext4_dir_llseek, .read = generic_read_dir, .iterate_shared = ext4_readdir, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3db01b933c3e..a95525bfb99c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -213,11 +213,14 @@ enum criteria { #define EXT4_MB_USE_RESERVED 0x2000 /* Do strict check for free blocks while retrying block allocation */ #define EXT4_MB_STRICT_CHECK 0x4000 -/* Large fragment size list lookup succeeded at least once for cr = 0 */ +/* Large fragment size list lookup succeeded at least once for + * CR_POWER2_ALIGNED */ #define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED 0x8000 -/* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ +/* Avg fragment size rb tree lookup succeeded at least once for + * CR_GOAL_LEN_FAST */ #define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED 0x00010000 -/* Avg fragment size rb tree lookup succeeded at least once for cr = 1.5 */ +/* Avg fragment size rb tree lookup succeeded at least once for + * CR_BEST_AVAIL_LEN */ #define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED 0x00020000 struct ext4_allocation_request { @@ -252,8 +255,10 @@ struct ext4_allocation_request { #define EXT4_MAP_MAPPED BIT(BH_Mapped) #define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten) #define EXT4_MAP_BOUNDARY BIT(BH_Boundary) +#define EXT4_MAP_DELAYED BIT(BH_Delay) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ - EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY) + EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ + EXT4_MAP_DELAYED) struct ext4_map_blocks { ext4_fsblk_t m_pblk; @@ -273,7 +278,8 @@ struct ext4_system_blocks { /* * Flags for ext4_io_end->flags */ -#define EXT4_IO_END_UNWRITTEN 0x0001 +#define EXT4_IO_END_UNWRITTEN 0x0001 +#define EXT4_IO_END_FAILED 0x0002 struct ext4_io_end_vec { struct list_head list; /* list of io_end_vec */ @@ -362,6 +368,8 @@ struct ext4_io_submit { #define EXT4_MAX_BLOCKS(size, offset, blkbits) \ ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \ blkbits)) +#define EXT4_B_TO_LBLK(inode, offset) \ + (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits) /* Translate a block number to a cluster number */ #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) @@ -891,10 +899,13 @@ do { \ (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (ts).tv_sec, S32_MIN, S32_MAX)); \ } while (0) -#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ - EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, (inode)->xtime) +#define EXT4_INODE_SET_ATIME(inode, raw_inode) \ + EXT4_INODE_SET_XTIME_VAL(i_atime, inode, raw_inode, inode_get_atime(inode)) -#define EXT4_INODE_SET_CTIME(inode, raw_inode) \ +#define EXT4_INODE_SET_MTIME(inode, raw_inode) \ + EXT4_INODE_SET_XTIME_VAL(i_mtime, inode, raw_inode, inode_get_mtime(inode)) + +#define EXT4_INODE_SET_CTIME(inode, raw_inode) \ EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode)) #define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ @@ -910,9 +921,16 @@ do { \ .tv_sec = (signed)le32_to_cpu((raw_inode)->xtime) \ }) -#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ +#define EXT4_INODE_GET_ATIME(inode, raw_inode) \ do { \ - (inode)->xtime = EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode); \ + inode_set_atime_to_ts(inode, \ + EXT4_INODE_GET_XTIME_VAL(i_atime, inode, raw_inode)); \ +} while (0) + +#define EXT4_INODE_GET_MTIME(inode, raw_inode) \ +do { \ + inode_set_mtime_to_ts(inode, \ + EXT4_INODE_GET_XTIME_VAL(i_mtime, inode, raw_inode)); \ } while (0) #define EXT4_INODE_GET_CTIME(inode, raw_inode) \ @@ -1043,6 +1061,7 @@ struct ext4_inode_info { /* Number of ongoing updates on this inode */ atomic_t i_fc_updates; + atomic_t i_unwritten; /* Nr. of inflight conversions pending */ /* Fast commit wait queue for this inode */ wait_queue_head_t i_fc_wait; @@ -1091,6 +1110,10 @@ struct ext4_inode_info { /* mballoc */ atomic_t i_prealloc_active; + + /* allocation reservation info for delalloc */ + /* In case of bigalloc, this refer to clusters rather than blocks */ + unsigned int i_reserved_data_blocks; struct rb_root i_prealloc_node; rwlock_t i_prealloc_lock; @@ -1107,10 +1130,6 @@ struct ext4_inode_info { /* ialloc */ ext4_group_t i_last_alloc_group; - /* allocation reservation info for delalloc */ - /* In case of bigalloc, this refer to clusters rather than blocks */ - unsigned int i_reserved_data_blocks; - /* pending cluster reservations for bigalloc file systems */ struct ext4_pending_tree i_pending_tree; @@ -1134,7 +1153,6 @@ struct ext4_inode_info { */ struct list_head i_rsv_conversion_list; struct work_struct i_rsv_conversion_work; - atomic_t i_unwritten; /* Nr. of inflight conversions pending */ spinlock_t i_block_reservation_lock; @@ -1332,7 +1350,7 @@ struct ext4_super_block { /*60*/ __le32 s_feature_incompat; /* incompatible feature set */ __le32 s_feature_ro_compat; /* readonly-compatible feature set */ /*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ -/*78*/ char s_volume_name[EXT4_LABEL_MAX]; /* volume name */ +/*78*/ char s_volume_name[EXT4_LABEL_MAX] __nonstring; /* volume name */ /*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */ /*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ /* @@ -1494,6 +1512,7 @@ struct ext4_sb_info { loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ struct buffer_head * s_sbh; /* Buffer containing the super block */ struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ + /* Array of bh's for the block group descriptors */ struct buffer_head * __rcu *s_group_desc; unsigned int s_mount_opt; unsigned int s_mount_opt2; @@ -1537,7 +1556,7 @@ struct ext4_sb_info { unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; - struct block_device *s_journal_bdev; + struct file *s_journal_bdev_file; #ifdef CONFIG_QUOTA /* Names of quota files with journalled quota */ char __rcu *s_qf_names[EXT4_MAXQUOTAS]; @@ -1564,7 +1583,7 @@ struct ext4_sb_info { unsigned int *s_mb_maxs; unsigned int s_group_info_size; unsigned int s_mb_free_pending; - struct list_head s_freed_data_list; /* List of blocks to be freed + struct list_head s_freed_data_list[2]; /* List of blocks to be freed after commit completed */ struct list_head s_discard_list; struct work_struct s_discard_work; @@ -1653,7 +1672,7 @@ struct ext4_sb_info { __u32 s_csum_seed; /* Reclaim extents from extent status tree */ - struct shrinker s_es_shrinker; + struct shrinker *s_es_shrinker; struct list_head s_es_list; /* List of inodes with reclaimable extents */ long s_es_nr_inode; struct ext4_es_stats s_es_stats; @@ -2314,9 +2333,9 @@ struct ext4_dir_entry_2 { ((struct ext4_dir_entry_hash *) \ (((void *)(entry)) + \ ((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND))) -#define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(de)->hash) +#define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(entry)->hash) #define EXT4_DIRENT_MINOR_HASH(entry) \ - le32_to_cpu(EXT4_DIRENT_HASHES(de)->minor_hash) + le32_to_cpu(EXT4_DIRENT_HASHES(entry)->minor_hash) static inline bool ext4_hash_in_dirent(const struct inode *inode) { @@ -2438,6 +2457,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) #define DX_HASH_HALF_MD4_UNSIGNED 4 #define DX_HASH_TEA_UNSIGNED 5 #define DX_HASH_SIPHASH 6 +#define DX_HASH_LAST DX_HASH_SIPHASH static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, const void *address, unsigned int length) @@ -2487,7 +2507,7 @@ struct ext4_filename { struct fscrypt_str crypto_buf; #endif #if IS_ENABLED(CONFIG_UNICODE) - struct fscrypt_str cf_name; + struct qstr cf_name; #endif }; @@ -2529,6 +2549,8 @@ struct dir_private_info { __u32 curr_hash; __u32 curr_minor_hash; __u32 next_hash; + u64 cookie; + bool initialized; }; /* calculate the first block number of the group */ @@ -2669,10 +2691,10 @@ struct mmpd_data { extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); void ext4_inode_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, - struct buffer_head *bh, int sz); + struct buffer_head *bh); int ext4_inode_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, - struct buffer_head *bh, int sz); + struct buffer_head *bh); void ext4_block_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh); @@ -2721,8 +2743,25 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); #if IS_ENABLED(CONFIG_UNICODE) extern int ext4_fname_setup_ci_filename(struct inode *dir, - const struct qstr *iname, - struct ext4_filename *fname); + const struct qstr *iname, + struct ext4_filename *fname); + +static inline void ext4_fname_free_ci_filename(struct ext4_filename *fname) +{ + kfree(fname->cf_name.name); + fname->cf_name.name = NULL; +} +#else +static inline int ext4_fname_setup_ci_filename(struct inode *dir, + const struct qstr *iname, + struct ext4_filename *fname) +{ + return 0; +} + +static inline void ext4_fname_free_ci_filename(struct ext4_filename *fname) +{ +} #endif /* ext4 encryption related stuff goes here crypto.c */ @@ -2745,16 +2784,11 @@ static inline int ext4_fname_setup_filename(struct inode *dir, int lookup, struct ext4_filename *fname) { - int err = 0; fname->usr_fname = iname; fname->disk_name.name = (unsigned char *) iname->name; fname->disk_name.len = iname->len; -#if IS_ENABLED(CONFIG_UNICODE) - err = ext4_fname_setup_ci_filename(dir, iname, fname); -#endif - - return err; + return ext4_fname_setup_ci_filename(dir, iname, fname); } static inline int ext4_fname_prepare_lookup(struct inode *dir, @@ -2766,10 +2800,7 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir, static inline void ext4_fname_free_filename(struct ext4_filename *fname) { -#if IS_ENABLED(CONFIG_UNICODE) - kfree(fname->cf_name.name); - fname->cf_name.name = NULL; -#endif + ext4_fname_free_ci_filename(fname); } static inline int ext4_ioctl_get_encryption_pwsalt(struct file *filp, @@ -2893,10 +2924,10 @@ extern const struct seq_operations ext4_mb_seq_groups_ops; extern const struct seq_operations ext4_mb_seq_structs_summary_ops; extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset); extern int ext4_mb_init(struct super_block *); -extern int ext4_mb_release(struct super_block *); +extern void ext4_mb_release(struct super_block *); extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, struct ext4_allocation_request *, int *); -extern void ext4_discard_preallocations(struct inode *, unsigned int); +extern void ext4_discard_preallocations(struct inode *); extern int __init ext4_init_mballoc(void); extern void ext4_exit_mballoc(void); extern ext4_group_t ext4_mb_prefetch(struct super_block *sb, @@ -2917,7 +2948,7 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, - int len, int state); + int len, bool state); static inline bool ext4_mb_cr_expensive(enum criteria cr) { return cr >= CR_GOAL_LEN_SLOW; @@ -2984,6 +3015,8 @@ extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); +extern int ext4_truncate_page_cache_block_range(struct inode *inode, + loff_t start, loff_t end); extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); @@ -3338,6 +3371,13 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) return 1 << sbi->s_log_groups_per_flex; } +static inline loff_t ext4_get_maxbytes(struct inode *inode) +{ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return inode->i_sb->s_maxbytes; + return EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; +} + #define ext4_std_error(sb, errno) \ do { \ if ((errno)) \ @@ -3530,13 +3570,13 @@ int ext4_readpage_inline(struct inode *inode, struct folio *folio); extern int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, - struct page **pagep); + struct folio **foliop); int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct folio *folio); extern int ext4_da_write_inline_data_begin(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, - struct page **pagep, + struct folio **foliop, void **fsdata); extern int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, @@ -3677,11 +3717,12 @@ extern int ext4_map_blocks(handle_t *handle, struct inode *inode, extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, struct ext4_ext_path *path); -extern int ext4_ext_insert_extent(handle_t *, struct inode *, - struct ext4_ext_path **, - struct ext4_extent *, int); +extern struct ext4_ext_path *ext4_ext_insert_extent( + handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *newext, int gb_flags); extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, - struct ext4_ext_path **, + struct ext4_ext_path *, int flags); extern void ext4_free_ext_path(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); @@ -3818,6 +3859,9 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh) return buffer_uptodate(bh); } +extern int ext4_block_write_begin(handle_t *handle, struct folio *folio, + loff_t pos, unsigned len, + get_block_t *get_block); #endif /* __KERNEL__ */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index d1a2e6624401..da4a82456383 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -206,7 +206,7 @@ static void ext4_journal_abort_handle(const char *caller, unsigned int line, static void ext4_check_bdev_write_error(struct super_block *sb) { - struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; + struct address_space *mapping = sb->s_bdev->bd_mapping; struct ext4_sb_info *sbi = EXT4_SB(sb); int err; @@ -235,8 +235,6 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, might_sleep(); - ext4_check_bdev_write_error(sb); - if (ext4_handle_valid(handle)) { err = jbd2_journal_get_write_access(handle, bh); if (err) { @@ -244,7 +242,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, handle, err); return err; } - } + } else + ext4_check_bdev_write_error(sb); if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) return 0; BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 5ea75af6ca22..2f9c3cd4f26c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -84,12 +84,11 @@ static void ext4_extent_block_csum_set(struct inode *inode, et->et_checksum = ext4_extent_block_csum(inode, eh); } -static int ext4_split_extent_at(handle_t *handle, - struct inode *inode, - struct ext4_ext_path **ppath, - ext4_lblk_t split, - int split_flag, - int flags); +static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t split, + int split_flag, int flags); static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) { @@ -100,27 +99,33 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) * i_rwsem. So we can safely drop the i_data_sem here. */ BUG_ON(EXT4_JOURNAL(inode) == NULL); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); *dropped = 1; return 0; } +static inline void ext4_ext_path_brelse(struct ext4_ext_path *path) +{ + brelse(path->p_bh); + path->p_bh = NULL; +} + static void ext4_ext_drop_refs(struct ext4_ext_path *path) { int depth, i; - if (!path) + if (IS_ERR_OR_NULL(path)) return; depth = path->p_depth; - for (i = 0; i <= depth; i++, path++) { - brelse(path->p_bh); - path->p_bh = NULL; - } + for (i = 0; i <= depth; i++, path++) + ext4_ext_path_brelse(path); } void ext4_free_ext_path(struct ext4_ext_path *path) { + if (IS_ERR_OR_NULL(path)) + return; ext4_ext_drop_refs(path); kfree(path); } @@ -323,19 +328,18 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check) return size; } -static inline int +static inline struct ext4_ext_path * ext4_force_split_extent_at(handle_t *handle, struct inode *inode, - struct ext4_ext_path **ppath, ext4_lblk_t lblk, + struct ext4_ext_path *path, ext4_lblk_t lblk, int nofail) { - struct ext4_ext_path *path = *ppath; int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext); int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO; if (nofail) flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL; - return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ? + return ext4_split_extent_at(handle, inode, path, lblk, unwritten ? EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0, flags); } @@ -635,8 +639,7 @@ int ext4_ext_precache(struct inode *inode) */ if ((i == depth) || path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) { - brelse(path[i].p_bh); - path[i].p_bh = NULL; + ext4_ext_path_brelse(path + i); i--; continue; } @@ -689,7 +692,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) struct ext4_extent *ex; int i; - if (!path) + if (IS_ERR_OR_NULL(path)) return; eh = path[depth].p_hdr; @@ -881,11 +884,10 @@ void ext4_ext_tree_init(handle_t *handle, struct inode *inode) struct ext4_ext_path * ext4_find_extent(struct inode *inode, ext4_lblk_t block, - struct ext4_ext_path **orig_path, int flags) + struct ext4_ext_path *path, int flags) { struct ext4_extent_header *eh; struct buffer_head *bh; - struct ext4_ext_path *path = orig_path ? *orig_path : NULL; short int depth, i, ppos = 0; int ret; gfp_t gfp_flags = GFP_NOFS; @@ -906,7 +908,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, ext4_ext_drop_refs(path); if (depth > path[0].p_maxdepth) { kfree(path); - *orig_path = path = NULL; + path = NULL; } } if (!path) { @@ -957,14 +959,10 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, ext4_ext_show_path(inode, path); - if (orig_path) - *orig_path = path; return path; err: ext4_free_ext_path(path); - if (orig_path) - *orig_path = NULL; return ERR_PTR(ret); } @@ -1397,15 +1395,15 @@ out: * finds empty index and adds new leaf. * if no free index is found, then it requests in-depth growing. */ -static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, - unsigned int mb_flags, - unsigned int gb_flags, - struct ext4_ext_path **ppath, - struct ext4_extent *newext) +static struct ext4_ext_path * +ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, + unsigned int mb_flags, unsigned int gb_flags, + struct ext4_ext_path *path, + struct ext4_extent *newext) { - struct ext4_ext_path *path = *ppath; struct ext4_ext_path *curp; int depth, i, err = 0; + ext4_lblk_t ee_block = le32_to_cpu(newext->ee_block); repeat: i = depth = ext_depth(inode); @@ -1424,42 +1422,38 @@ repeat: * entry: create all needed subtree and add new leaf */ err = ext4_ext_split(handle, inode, mb_flags, path, newext, i); if (err) - goto out; + goto errout; /* refill path */ - path = ext4_find_extent(inode, - (ext4_lblk_t)le32_to_cpu(newext->ee_block), - ppath, gb_flags); - if (IS_ERR(path)) - err = PTR_ERR(path); - } else { - /* tree is full, time to grow in depth */ - err = ext4_ext_grow_indepth(handle, inode, mb_flags); - if (err) - goto out; + path = ext4_find_extent(inode, ee_block, path, gb_flags); + return path; + } - /* refill path */ - path = ext4_find_extent(inode, - (ext4_lblk_t)le32_to_cpu(newext->ee_block), - ppath, gb_flags); - if (IS_ERR(path)) { - err = PTR_ERR(path); - goto out; - } + /* tree is full, time to grow in depth */ + err = ext4_ext_grow_indepth(handle, inode, mb_flags); + if (err) + goto errout; - /* - * only first (depth 0 -> 1) produces free space; - * in all other cases we have to split the grown tree - */ - depth = ext_depth(inode); - if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { - /* now we need to split */ - goto repeat; - } + /* refill path */ + path = ext4_find_extent(inode, ee_block, path, gb_flags); + if (IS_ERR(path)) + return path; + + /* + * only first (depth 0 -> 1) produces free space; + * in all other cases we have to split the grown tree + */ + depth = ext_depth(inode); + if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { + /* now we need to split */ + goto repeat; } -out: - return err; + return path; + +errout: + ext4_free_ext_path(path); + return ERR_PTR(err); } /* @@ -1751,12 +1745,23 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, break; err = ext4_ext_get_access(handle, inode, path + k); if (err) - break; + goto clean; path[k].p_idx->ei_block = border; err = ext4_ext_dirty(handle, inode, path + k); if (err) - break; + goto clean; } + return 0; + +clean: + /* + * The path[k].p_bh is either unmodified or with no verified bit + * set (see ext4_ext_get_access()). So just clear the verified bit + * of the successfully modified extents buffers, which will force + * these extents to be checked to avoid using inconsistent data. + */ + while (++k < depth) + clear_buffer_verified(path[k].p_bh); return err; } @@ -1878,8 +1883,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle, (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr)); path[0].p_hdr->eh_max = cpu_to_le16(max_root); - brelse(path[1].p_bh); - path[1].p_bh = NULL; + ext4_ext_path_brelse(path + 1); ext4_free_blocks(handle, inode, NULL, blk, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } @@ -1967,16 +1971,15 @@ out: * inserts requested extent as new one into the tree, * creating new leaf in the no-space case. */ -int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, - struct ext4_ext_path **ppath, - struct ext4_extent *newext, int gb_flags) +struct ext4_ext_path * +ext4_ext_insert_extent(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *newext, int gb_flags) { - struct ext4_ext_path *path = *ppath; struct ext4_extent_header *eh; struct ext4_extent *ex, *fex; struct ext4_extent *nearex; /* nearest extent */ - struct ext4_ext_path *npath = NULL; - int depth, len, err; + int depth, len, err = 0; ext4_lblk_t next; int mb_flags = 0, unwritten; @@ -1984,14 +1987,16 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, mb_flags |= EXT4_MB_DELALLOC_RESERVED; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); - return -EFSCORRUPTED; + err = -EFSCORRUPTED; + goto errout; } depth = ext_depth(inode); ex = path[depth].p_ext; eh = path[depth].p_hdr; if (unlikely(path[depth].p_hdr == NULL)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); - return -EFSCORRUPTED; + err = -EFSCORRUPTED; + goto errout; } /* try to insert block into found extent and return */ @@ -2029,7 +2034,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, err = ext4_ext_get_access(handle, inode, path + depth); if (err) - return err; + goto errout; unwritten = ext4_ext_is_unwritten(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); @@ -2054,7 +2059,7 @@ prepend: err = ext4_ext_get_access(handle, inode, path + depth); if (err) - return err; + goto errout; unwritten = ext4_ext_is_unwritten(ex); ex->ee_block = newext->ee_block; @@ -2079,21 +2084,26 @@ prepend: if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) next = ext4_ext_next_leaf_block(path); if (next != EXT_MAX_BLOCKS) { + struct ext4_ext_path *npath; + ext_debug(inode, "next leaf block - %u\n", next); - BUG_ON(npath != NULL); npath = ext4_find_extent(inode, next, NULL, gb_flags); - if (IS_ERR(npath)) - return PTR_ERR(npath); + if (IS_ERR(npath)) { + err = PTR_ERR(npath); + goto errout; + } BUG_ON(npath->p_depth != path->p_depth); eh = npath[depth].p_hdr; if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { ext_debug(inode, "next leaf isn't full(%d)\n", le16_to_cpu(eh->eh_entries)); + ext4_free_ext_path(path); path = npath; goto has_space; } ext_debug(inode, "next leaf has no free space(%d,%d)\n", le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); + ext4_free_ext_path(npath); } /* @@ -2102,11 +2112,10 @@ prepend: */ if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) mb_flags |= EXT4_MB_USE_RESERVED; - err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, - ppath, newext); - if (err) - goto cleanup; - path = *ppath; + path = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, + path, newext); + if (IS_ERR(path)) + return path; depth = ext_depth(inode); eh = path[depth].p_hdr; @@ -2115,7 +2124,7 @@ has_space: err = ext4_ext_get_access(handle, inode, path + depth); if (err) - goto cleanup; + goto errout; if (!nearex) { /* there is no extent in this leaf, create first one */ @@ -2173,17 +2182,20 @@ merge: if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) ext4_ext_try_to_merge(handle, inode, path, nearex); - /* time to correct all indexes above */ err = ext4_ext_correct_indexes(handle, inode, path); if (err) - goto cleanup; + goto errout; err = ext4_ext_dirty(handle, inode, path + path->p_depth); + if (err) + goto errout; -cleanup: - ext4_free_ext_path(npath); - return err; + return path; + +errout: + ext4_free_ext_path(path); + return ERR_PTR(err); } static int ext4_fill_es_cache_info(struct inode *inode, @@ -2283,27 +2295,26 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, { int err; ext4_fsblk_t leaf; + int k = depth - 1; /* free index block */ - depth--; - path = path + depth; - leaf = ext4_idx_pblock(path->p_idx); - if (unlikely(path->p_hdr->eh_entries == 0)) { - EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); + leaf = ext4_idx_pblock(path[k].p_idx); + if (unlikely(path[k].p_hdr->eh_entries == 0)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr->eh_entries == 0", k); return -EFSCORRUPTED; } - err = ext4_ext_get_access(handle, inode, path); + err = ext4_ext_get_access(handle, inode, path + k); if (err) return err; - if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) { - int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx; + if (path[k].p_idx != EXT_LAST_INDEX(path[k].p_hdr)) { + int len = EXT_LAST_INDEX(path[k].p_hdr) - path[k].p_idx; len *= sizeof(struct ext4_extent_idx); - memmove(path->p_idx, path->p_idx + 1, len); + memmove(path[k].p_idx, path[k].p_idx + 1, len); } - le16_add_cpu(&path->p_hdr->eh_entries, -1); - err = ext4_ext_dirty(handle, inode, path); + le16_add_cpu(&path[k].p_hdr->eh_entries, -1); + err = ext4_ext_dirty(handle, inode, path + k); if (err) return err; ext_debug(inode, "index is empty, remove it, free block %llu\n", leaf); @@ -2312,18 +2323,29 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, ext4_free_blocks(handle, inode, NULL, leaf, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); - while (--depth >= 0) { - if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr)) + while (--k >= 0) { + if (path[k + 1].p_idx != EXT_FIRST_INDEX(path[k + 1].p_hdr)) break; - path--; - err = ext4_ext_get_access(handle, inode, path); + err = ext4_ext_get_access(handle, inode, path + k); if (err) - break; - path->p_idx->ei_block = (path+1)->p_idx->ei_block; - err = ext4_ext_dirty(handle, inode, path); + goto clean; + path[k].p_idx->ei_block = path[k + 1].p_idx->ei_block; + err = ext4_ext_dirty(handle, inode, path + k); if (err) - break; + goto clean; } + return 0; + +clean: + /* + * The path[k].p_bh is either unmodified or with no verified bit + * set (see ext4_ext_get_access()). So just clear the verified bit + * of the successfully modified extents buffers, which will force + * these extents to be checked to avoid using inconsistent data. + */ + while (++k < depth) + clear_buffer_verified(path[k].p_bh); + return err; } @@ -2374,18 +2396,19 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, int ext4_ext_index_trans_blocks(struct inode *inode, int extents) { int index; - int depth; /* If we are converting the inline data, only one is needed here. */ if (ext4_has_inline_data(inode)) return 1; - depth = ext_depth(inode); - + /* + * Extent tree can change between the time we estimate credits and + * the time we actually modify the tree. Assume the worst case. + */ if (extents <= 1) - index = depth * 2; + index = EXT4_MAX_EXTENT_DEPTH * 2; else - index = depth * 3; + index = EXT4_MAX_EXTENT_DEPTH * 3; return index; } @@ -2876,11 +2899,12 @@ again: * fail removing space due to ENOSPC so try to use * reserved block if that happens. */ - err = ext4_force_split_extent_at(handle, inode, &path, - end + 1, 1); - if (err < 0) + path = ext4_force_split_extent_at(handle, inode, path, + end + 1, 1); + if (IS_ERR(path)) { + err = PTR_ERR(path); goto out; - + } } else if (sbi->s_cluster_ratio > 1 && end >= ex_end && partial.state == initial) { /* @@ -2938,8 +2962,7 @@ again: err = ext4_ext_rm_leaf(handle, inode, path, &partial, start, end); /* root level has p_bh == NULL, brelse() eats this */ - brelse(path[i].p_bh); - path[i].p_bh = NULL; + ext4_ext_path_brelse(path + i); i--; continue; } @@ -3001,8 +3024,7 @@ again: err = ext4_ext_rm_idx(handle, inode, path, i); } /* root level has p_bh == NULL, brelse() eats this */ - brelse(path[i].p_bh); - path[i].p_bh = NULL; + ext4_ext_path_brelse(path + i); i--; ext_debug(inode, "return to level %d\n", i); } @@ -3117,7 +3139,7 @@ static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) return; ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, - EXTENT_STATUS_WRITTEN); + EXTENT_STATUS_WRITTEN, 0); } /* FIXME!! we need to try to merge to left or right after zero-out */ @@ -3151,16 +3173,14 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) * a> the extent are splitted into two extent. * b> split is not needed, and just mark the extent. * - * return 0 on success. + * Return an extent path pointer on success, or an error pointer on failure. */ -static int ext4_split_extent_at(handle_t *handle, - struct inode *inode, - struct ext4_ext_path **ppath, - ext4_lblk_t split, - int split_flag, - int flags) +static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t split, + int split_flag, int flags) { - struct ext4_ext_path *path = *ppath; ext4_fsblk_t newblock; ext4_lblk_t ee_block; struct ext4_extent *ex, newex, orig_ex, zero_ex; @@ -3230,24 +3250,27 @@ static int ext4_split_extent_at(handle_t *handle, if (split_flag & EXT4_EXT_MARK_UNWRIT2) ext4_ext_mark_unwritten(ex2); - err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags); - if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM) + path = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (!IS_ERR(path)) goto out; + err = PTR_ERR(path); + if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM) + return path; + /* - * Update path is required because previous ext4_ext_insert_extent() - * may have freed or reallocated the path. Using EXT4_EX_NOFAIL - * guarantees that ext4_find_extent() will not return -ENOMEM, - * otherwise -ENOMEM will cause a retry in do_writepages(), and a - * WARN_ON may be triggered in ext4_da_update_reserve_space() due to - * an incorrect ee_len causing the i_reserved_data_blocks exception. + * Get a new path to try to zeroout or fix the extent length. + * Using EXT4_EX_NOFAIL guarantees that ext4_find_extent() + * will not return -ENOMEM, otherwise -ENOMEM will cause a + * retry in do_writepages(), and a WARN_ON may be triggered + * in ext4_da_update_reserve_space() due to an incorrect + * ee_len causing the i_reserved_data_blocks exception. */ - path = ext4_find_extent(inode, ee_block, ppath, - flags | EXT4_EX_NOFAIL); + path = ext4_find_extent(inode, ee_block, NULL, flags | EXT4_EX_NOFAIL); if (IS_ERR(path)) { EXT4_ERROR_INODE(inode, "Failed split extent on %u, err %ld", split, PTR_ERR(path)); - return PTR_ERR(path); + return path; } depth = ext_depth(inode); ex = path[depth].p_ext; @@ -3302,10 +3325,13 @@ fix_extent_len: * and err is a non-zero error code. */ ext4_ext_dirty(handle, inode, path + path->p_depth); - return err; out: - ext4_ext_show_leaf(inode, *ppath); - return err; + if (err) { + ext4_free_ext_path(path); + path = ERR_PTR(err); + } + ext4_ext_show_leaf(inode, path); + return path; } /* @@ -3319,21 +3345,18 @@ out: * c> Splits in three extents: Somone is splitting in middle of the extent * */ -static int ext4_split_extent(handle_t *handle, - struct inode *inode, - struct ext4_ext_path **ppath, - struct ext4_map_blocks *map, - int split_flag, - int flags) +static struct ext4_ext_path *ext4_split_extent(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + struct ext4_map_blocks *map, + int split_flag, int flags, + unsigned int *allocated) { - struct ext4_ext_path *path = *ppath; ext4_lblk_t ee_block; struct ext4_extent *ex; unsigned int ee_len, depth; - int err = 0; int unwritten; int split_flag1, flags1; - int allocated = map->m_len; depth = ext_depth(inode); ex = path[depth].p_ext; @@ -3349,28 +3372,27 @@ static int ext4_split_extent(handle_t *handle, EXT4_EXT_MARK_UNWRIT2; if (split_flag & EXT4_EXT_DATA_VALID2) split_flag1 |= EXT4_EXT_DATA_VALID1; - err = ext4_split_extent_at(handle, inode, ppath, + path = ext4_split_extent_at(handle, inode, path, map->m_lblk + map->m_len, split_flag1, flags1); - if (err) - goto out; - } else { - allocated = ee_len - (map->m_lblk - ee_block); - } - /* - * Update path is required because previous ext4_split_extent_at() may - * result in split of original leaf or extent zeroout. - */ - path = ext4_find_extent(inode, map->m_lblk, ppath, flags); - if (IS_ERR(path)) - return PTR_ERR(path); - depth = ext_depth(inode); - ex = path[depth].p_ext; - if (!ex) { - EXT4_ERROR_INODE(inode, "unexpected hole at %lu", - (unsigned long) map->m_lblk); - return -EFSCORRUPTED; + if (IS_ERR(path)) + return path; + /* + * Update path is required because previous ext4_split_extent_at + * may result in split of original leaf or extent zeroout. + */ + path = ext4_find_extent(inode, map->m_lblk, path, flags); + if (IS_ERR(path)) + return path; + depth = ext_depth(inode); + ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + ext4_free_ext_path(path); + return ERR_PTR(-EFSCORRUPTED); + } + unwritten = ext4_ext_is_unwritten(ex); } - unwritten = ext4_ext_is_unwritten(ex); if (map->m_lblk >= ee_block) { split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; @@ -3379,15 +3401,20 @@ static int ext4_split_extent(handle_t *handle, split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | EXT4_EXT_MARK_UNWRIT2); } - err = ext4_split_extent_at(handle, inode, ppath, + path = ext4_split_extent_at(handle, inode, path, map->m_lblk, split_flag1, flags); - if (err) - goto out; + if (IS_ERR(path)) + return path; } - ext4_ext_show_leaf(inode, *ppath); -out: - return err ? err : allocated; + if (allocated) { + if (map->m_lblk + map->m_len > ee_block + ee_len) + *allocated = ee_len - (map->m_lblk - ee_block); + else + *allocated = map->m_len; + } + ext4_ext_show_leaf(inode, path); + return path; } /* @@ -3410,13 +3437,11 @@ out: * that are allocated and initialized. * It is guaranteed to be >= map->m_len. */ -static int ext4_ext_convert_to_initialized(handle_t *handle, - struct inode *inode, - struct ext4_map_blocks *map, - struct ext4_ext_path **ppath, - int flags) +static struct ext4_ext_path * +ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, struct ext4_ext_path *path, + int flags, unsigned int *allocated) { - struct ext4_ext_path *path = *ppath; struct ext4_sb_info *sbi; struct ext4_extent_header *eh; struct ext4_map_blocks split_map; @@ -3426,7 +3451,6 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, unsigned int ee_len, depth, map_len = map->m_len; int err = 0; int split_flag = EXT4_EXT_DATA_VALID2; - int allocated = 0; unsigned int max_zeroout = 0; ext_debug(inode, "logical block %llu, max_blocks %u\n", @@ -3467,6 +3491,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, * - L2: we only attempt to merge with an extent stored in the * same extent tree node. */ + *allocated = 0; if ((map->m_lblk == ee_block) && /* See if we can merge left */ (map_len < ee_len) && /*L1*/ @@ -3496,7 +3521,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ err = ext4_ext_get_access(handle, inode, path + depth); if (err) - goto out; + goto errout; trace_ext4_ext_convert_to_initialized_fastpath(inode, map, ex, abut_ex); @@ -3511,7 +3536,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, abut_ex->ee_len = cpu_to_le16(prev_len + map_len); /* Result: number of initialized blocks past m_lblk */ - allocated = map_len; + *allocated = map_len; } } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) && (map_len < ee_len) && /*L1*/ @@ -3542,7 +3567,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ err = ext4_ext_get_access(handle, inode, path + depth); if (err) - goto out; + goto errout; trace_ext4_ext_convert_to_initialized_fastpath(inode, map, ex, abut_ex); @@ -3557,18 +3582,20 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, abut_ex->ee_len = cpu_to_le16(next_len + map_len); /* Result: number of initialized blocks past m_lblk */ - allocated = map_len; + *allocated = map_len; } } - if (allocated) { + if (*allocated) { /* Mark the block containing both extents as dirty */ err = ext4_ext_dirty(handle, inode, path + depth); /* Update path to point to the right extent */ path[depth].p_ext = abut_ex; + if (err) + goto errout; goto out; } else - allocated = ee_len - (map->m_lblk - ee_block); + *allocated = ee_len - (map->m_lblk - ee_block); WARN_ON(map->m_lblk < ee_block); /* @@ -3595,21 +3622,21 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, split_map.m_lblk = map->m_lblk; split_map.m_len = map->m_len; - if (max_zeroout && (allocated > split_map.m_len)) { - if (allocated <= max_zeroout) { + if (max_zeroout && (*allocated > split_map.m_len)) { + if (*allocated <= max_zeroout) { /* case 3 or 5 */ zero_ex1.ee_block = cpu_to_le32(split_map.m_lblk + split_map.m_len); zero_ex1.ee_len = - cpu_to_le16(allocated - split_map.m_len); + cpu_to_le16(*allocated - split_map.m_len); ext4_ext_store_pblock(&zero_ex1, ext4_ext_pblock(ex) + split_map.m_lblk + split_map.m_len - ee_block); err = ext4_ext_zeroout(inode, &zero_ex1); if (err) goto fallback; - split_map.m_len = allocated; + split_map.m_len = *allocated; } if (split_map.m_lblk - ee_block + split_map.m_len < max_zeroout) { @@ -3627,22 +3654,24 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, split_map.m_len += split_map.m_lblk - ee_block; split_map.m_lblk = ee_block; - allocated = map->m_len; + *allocated = map->m_len; } } fallback: - err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag, - flags); - if (err > 0) - err = 0; + path = ext4_split_extent(handle, inode, path, &split_map, split_flag, + flags, NULL); + if (IS_ERR(path)) + return path; out: /* If we have gotten a failure, don't zero out status tree */ - if (!err) { - ext4_zeroout_es(inode, &zero_ex1); - ext4_zeroout_es(inode, &zero_ex2); - } - return err ? err : allocated; + ext4_zeroout_es(inode, &zero_ex1); + ext4_zeroout_es(inode, &zero_ex2); + return path; + +errout: + ext4_free_ext_path(path); + return ERR_PTR(err); } /* @@ -3667,15 +3696,16 @@ out: * being filled will be convert to initialized by the end_io callback function * via ext4_convert_unwritten_extents(). * - * Returns the size of unwritten extent to be written on success. + * The size of unwritten extent to be written is passed to the caller via the + * allocated pointer. Return an extent path pointer on success, or an error + * pointer on failure. */ -static int ext4_split_convert_extents(handle_t *handle, +static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path **ppath, - int flags) + struct ext4_ext_path *path, + int flags, unsigned int *allocated) { - struct ext4_ext_path *path = *ppath; ext4_lblk_t eof_block; ext4_lblk_t ee_block; struct ext4_extent *ex; @@ -3708,15 +3738,15 @@ static int ext4_split_convert_extents(handle_t *handle, split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); } flags |= EXT4_GET_BLOCKS_PRE_IO; - return ext4_split_extent(handle, inode, ppath, map, split_flag, flags); + return ext4_split_extent(handle, inode, path, map, split_flag, flags, + allocated); } -static int ext4_convert_unwritten_extents_endio(handle_t *handle, - struct inode *inode, - struct ext4_map_blocks *map, - struct ext4_ext_path **ppath) +static struct ext4_ext_path * +ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path) { - struct ext4_ext_path *path = *ppath; struct ext4_extent *ex; ext4_lblk_t ee_block; unsigned int ee_len; @@ -3744,20 +3774,21 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, inode->i_ino, (unsigned long long)ee_block, ee_len, (unsigned long long)map->m_lblk, map->m_len); #endif - err = ext4_split_convert_extents(handle, inode, map, ppath, - EXT4_GET_BLOCKS_CONVERT); - if (err < 0) - return err; - path = ext4_find_extent(inode, map->m_lblk, ppath, 0); + path = ext4_split_convert_extents(handle, inode, map, path, + EXT4_GET_BLOCKS_CONVERT, NULL); if (IS_ERR(path)) - return PTR_ERR(path); + return path; + + path = ext4_find_extent(inode, map->m_lblk, path, 0); + if (IS_ERR(path)) + return path; depth = ext_depth(inode); ex = path[depth].p_ext; } err = ext4_ext_get_access(handle, inode, path + depth); if (err) - goto out; + goto errout; /* first mark the extent as initialized */ ext4_ext_mark_initialized(ex); @@ -3768,18 +3799,23 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, /* Mark modified extent as dirty */ err = ext4_ext_dirty(handle, inode, path + path->p_depth); -out: + if (err) + goto errout; + ext4_ext_show_leaf(inode, path); - return err; + return path; + +errout: + ext4_free_ext_path(path); + return ERR_PTR(err); } -static int +static struct ext4_ext_path * convert_initialized_extent(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path **ppath, + struct ext4_ext_path *path, unsigned int *allocated) { - struct ext4_ext_path *path = *ppath; struct ext4_extent *ex; ext4_lblk_t ee_block; unsigned int ee_len; @@ -3802,25 +3838,27 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, (unsigned long long)ee_block, ee_len); if (ee_block != map->m_lblk || ee_len > map->m_len) { - err = ext4_split_convert_extents(handle, inode, map, ppath, - EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); - if (err < 0) - return err; - path = ext4_find_extent(inode, map->m_lblk, ppath, 0); + path = ext4_split_convert_extents(handle, inode, map, path, + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, NULL); if (IS_ERR(path)) - return PTR_ERR(path); + return path; + + path = ext4_find_extent(inode, map->m_lblk, path, 0); + if (IS_ERR(path)) + return path; depth = ext_depth(inode); ex = path[depth].p_ext; if (!ex) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) map->m_lblk); - return -EFSCORRUPTED; + err = -EFSCORRUPTED; + goto errout; } } err = ext4_ext_get_access(handle, inode, path + depth); if (err) - return err; + goto errout; /* first mark the extent as unwritten */ ext4_ext_mark_unwritten(ex); @@ -3832,7 +3870,7 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, /* Mark modified extent as dirty */ err = ext4_ext_dirty(handle, inode, path + path->p_depth); if (err) - return err; + goto errout; ext4_ext_show_leaf(inode, path); ext4_update_inode_fsync_trans(handle, inode, 1); @@ -3841,22 +3879,25 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, if (*allocated > map->m_len) *allocated = map->m_len; map->m_len = *allocated; - return 0; + return path; + +errout: + ext4_free_ext_path(path); + return ERR_PTR(err); } -static int +static struct ext4_ext_path * ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path **ppath, int flags, - unsigned int allocated, ext4_fsblk_t newblock) + struct ext4_ext_path *path, int flags, + unsigned int *allocated, ext4_fsblk_t newblock) { - int ret = 0; int err = 0; ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n", (unsigned long long)map->m_lblk, map->m_len, flags, - allocated); - ext4_ext_show_leaf(inode, *ppath); + *allocated); + ext4_ext_show_leaf(inode, path); /* * When writing into unwritten space, we should not fail to @@ -3865,36 +3906,34 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; trace_ext4_ext_handle_unwritten_extents(inode, map, flags, - allocated, newblock); + *allocated, newblock); /* get_block() before submitting IO, split the extent */ if (flags & EXT4_GET_BLOCKS_PRE_IO) { - ret = ext4_split_convert_extents(handle, inode, map, ppath, - flags | EXT4_GET_BLOCKS_CONVERT); - if (ret < 0) { - err = ret; - goto out2; - } + path = ext4_split_convert_extents(handle, inode, map, path, + flags | EXT4_GET_BLOCKS_CONVERT, allocated); + if (IS_ERR(path)) + return path; /* - * shouldn't get a 0 return when splitting an extent unless + * shouldn't get a 0 allocated when splitting an extent unless * m_len is 0 (bug) or extent has been corrupted */ - if (unlikely(ret == 0)) { + if (unlikely(*allocated == 0)) { EXT4_ERROR_INODE(inode, - "unexpected ret == 0, m_len = %u", + "unexpected allocated == 0, m_len = %u", map->m_len); err = -EFSCORRUPTED; - goto out2; + goto errout; } map->m_flags |= EXT4_MAP_UNWRITTEN; goto out; } /* IO end_io complete, convert the filled extent to written */ if (flags & EXT4_GET_BLOCKS_CONVERT) { - err = ext4_convert_unwritten_extents_endio(handle, inode, map, - ppath); - if (err < 0) - goto out2; + path = ext4_convert_unwritten_extents_endio(handle, inode, + map, path); + if (IS_ERR(path)) + return path; ext4_update_inode_fsync_trans(handle, inode, 1); goto map_out; } @@ -3926,36 +3965,37 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, * For buffered writes, at writepage time, etc. Convert a * discovered unwritten extent to written. */ - ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags); - if (ret < 0) { - err = ret; - goto out2; - } + path = ext4_ext_convert_to_initialized(handle, inode, map, path, + flags, allocated); + if (IS_ERR(path)) + return path; ext4_update_inode_fsync_trans(handle, inode, 1); /* - * shouldn't get a 0 return when converting an unwritten extent + * shouldn't get a 0 allocated when converting an unwritten extent * unless m_len is 0 (bug) or extent has been corrupted */ - if (unlikely(ret == 0)) { - EXT4_ERROR_INODE(inode, "unexpected ret == 0, m_len = %u", + if (unlikely(*allocated == 0)) { + EXT4_ERROR_INODE(inode, "unexpected allocated == 0, m_len = %u", map->m_len); err = -EFSCORRUPTED; - goto out2; + goto errout; } out: - allocated = ret; map->m_flags |= EXT4_MAP_NEW; map_out: map->m_flags |= EXT4_MAP_MAPPED; out1: map->m_pblk = newblock; - if (allocated > map->m_len) - allocated = map->m_len; - map->m_len = allocated; - ext4_ext_show_leaf(inode, *ppath); -out2: - return err ? err : allocated; + if (*allocated > map->m_len) + *allocated = map->m_len; + map->m_len = *allocated; + ext4_ext_show_leaf(inode, path); + return path; + +errout: + ext4_free_ext_path(path); + return ERR_PTR(err); } /* @@ -4098,8 +4138,11 @@ again: /* * The delalloc extent containing lblk, it must have been * added after ext4_map_blocks() checked the extent status - * tree, adjust the length to the delalloc extent's after - * lblk. + * tree so we are not holding i_rwsem and delalloc info is + * only stabilized by i_data_sem we are going to release + * soon. Don't modify the extent status tree and report + * extent as a hole, just adjust the length to the delalloc + * extent's after lblk. */ len = es.es_lblk + es.es_len - lblk; return len; @@ -4115,7 +4158,8 @@ again: insert_hole: /* Put just found gap into cache to speed up subsequent requests */ ext_debug(inode, " -> %u:%u\n", hole_start, len); - ext4_es_insert_extent(inode, hole_start, len, ~0, EXTENT_STATUS_HOLE); + ext4_es_insert_extent(inode, hole_start, len, ~0, + EXTENT_STATUS_HOLE, 0); /* Update hole_len to reflect hole size after lblk */ if (hole_start != lblk) @@ -4130,10 +4174,10 @@ insert_hole: * * Need to be called with * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block - * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) + * (ie, flags is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) * * return > 0, number of blocks already mapped/allocated - * if create == 0 and these are pre-allocated blocks + * if flags doesn't contain EXT4_GET_BLOCKS_CREATE and these are pre-allocated blocks * buffer head is unmapped * otherwise blocks are mapped * @@ -4149,7 +4193,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_extent newex, *ex, ex2; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0, pblk; - int err = 0, depth, ret; + int err = 0, depth; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; @@ -4162,7 +4206,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, path = ext4_find_extent(inode, map->m_lblk, NULL, 0); if (IS_ERR(path)) { err = PTR_ERR(path); - path = NULL; goto out; } @@ -4211,8 +4254,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, */ if ((!ext4_ext_is_unwritten(ex)) && (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { - err = convert_initialized_extent(handle, - inode, map, &path, &allocated); + path = convert_initialized_extent(handle, + inode, map, path, &allocated); + if (IS_ERR(path)) + err = PTR_ERR(path); goto out; } else if (!ext4_ext_is_unwritten(ex)) { map->m_flags |= EXT4_MAP_MAPPED; @@ -4224,20 +4269,18 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, goto out; } - ret = ext4_ext_handle_unwritten_extents( - handle, inode, map, &path, flags, - allocated, newblock); - if (ret < 0) - err = ret; - else - allocated = ret; + path = ext4_ext_handle_unwritten_extents( + handle, inode, map, path, flags, + &allocated, newblock); + if (IS_ERR(path)) + err = PTR_ERR(path); goto out; } } /* * requested block isn't allocated yet; - * we couldn't try to create block if create flag is zero + * we couldn't try to create block if flags doesn't contain EXT4_GET_BLOCKS_CREATE */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { ext4_lblk_t len; @@ -4282,6 +4325,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; + err = 0; goto got_allocated_blocks; } @@ -4354,8 +4398,9 @@ got_allocated_blocks: map->m_flags |= EXT4_MAP_UNWRITTEN; } - err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags); - if (err) { + path = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (IS_ERR(path)) { + err = PTR_ERR(path); if (allocated_clusters) { int fb_flags = 0; @@ -4364,7 +4409,7 @@ got_allocated_blocks: * not a good idea to call discard here directly, * but otherwise we'd need to call it every free(). */ - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE; ext4_free_blocks(handle, inode, NULL, newblock, @@ -4375,43 +4420,6 @@ got_allocated_blocks: } /* - * Reduce the reserved cluster count to reflect successful deferred - * allocation of delayed allocated clusters or direct allocation of - * clusters discovered to be delayed allocated. Once allocated, a - * cluster is not included in the reserved count. - */ - if (test_opt(inode->i_sb, DELALLOC) && allocated_clusters) { - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { - /* - * When allocating delayed allocated clusters, simply - * reduce the reserved cluster count and claim quota - */ - ext4_da_update_reserve_space(inode, allocated_clusters, - 1); - } else { - ext4_lblk_t lblk, len; - unsigned int n; - - /* - * When allocating non-delayed allocated clusters - * (from fallocate, filemap, DIO, or clusters - * allocated when delalloc has been disabled by - * ext4_nonda_switch), reduce the reserved cluster - * count by the number of allocated clusters that - * have previously been delayed allocated. Quota - * has been claimed by ext4_mb_new_blocks() above, - * so release the quota reservations made for any - * previously delayed allocated clusters. - */ - lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk); - len = allocated_clusters << sbi->s_cluster_bits; - n = ext4_es_delayed_clu(inode, lblk, len); - if (n > 0) - ext4_da_update_reserve_space(inode, (int) n, 0); - } - } - - /* * Cache the extent and update transaction to commit on fdatasync only * when it is _not_ an unwritten extent. */ @@ -4475,7 +4483,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, int depth = 0; struct ext4_map_blocks map; unsigned int credits; - loff_t epos; + loff_t epos, old_size = i_size_read(inode); BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)); map.m_lblk = offset; @@ -4532,7 +4540,13 @@ retry: if (epos > new_size) epos = new_size; if (ext4_update_inode_size(inode, epos) & 0x1) - inode->i_mtime = inode_get_ctime(inode); + inode_set_mtime_to_ts(inode, + inode_get_ctime(inode)); + if (epos > old_size) { + pagecache_isize_extended(inode, old_size, epos); + ext4_zero_partial_blocks(handle, inode, + old_size, epos - old_size); + } } ret2 = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); @@ -4555,131 +4569,65 @@ static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) { struct inode *inode = file_inode(file); - struct address_space *mapping = file->f_mapping; handle_t *handle = NULL; - unsigned int max_blocks; loff_t new_size = 0; - int ret = 0; - int flags; - int credits; - int partial_begin, partial_end; - loff_t start, end; - ext4_lblk_t lblk; + loff_t end = offset + len; + ext4_lblk_t start_lblk, end_lblk; + unsigned int blocksize = i_blocksize(inode); unsigned int blkbits = inode->i_blkbits; + int ret, flags, credits; trace_ext4_zero_range(inode, offset, len, mode); + WARN_ON_ONCE(!inode_is_locked(inode)); - /* - * Round up offset. This is not fallocate, we need to zero out - * blocks, so convert interior block aligned part of the range to - * unwritten and possibly manually zero out unaligned parts of the - * range. Here, start and partial_begin are inclusive, end and - * partial_end are exclusive. - */ - start = round_up(offset, 1 << blkbits); - end = round_down((offset + len), 1 << blkbits); - - if (start < offset || end > offset + len) - return -EINVAL; - partial_begin = offset & ((1 << blkbits) - 1); - partial_end = (offset + len) & ((1 << blkbits) - 1); - - lblk = start >> blkbits; - max_blocks = (end >> blkbits); - if (max_blocks < lblk) - max_blocks = 0; - else - max_blocks -= lblk; - - inode_lock(inode); - - /* - * Indirect files do not support unwritten extents - */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { - ret = -EOPNOTSUPP; - goto out_mutex; - } + /* Indirect files do not support unwritten extents */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EOPNOTSUPP; if (!(mode & FALLOC_FL_KEEP_SIZE) && - (offset + len > inode->i_size || - offset + len > EXT4_I(inode)->i_disksize)) { - new_size = offset + len; + (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) { + new_size = end; ret = inode_newsize_ok(inode, new_size); if (ret) - goto out_mutex; + return ret; } flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; - - /* Wait all existing dio workers, newcomers will block on i_rwsem */ - inode_dio_wait(inode); - - ret = file_modified(file); - if (ret) - goto out_mutex; - /* Preallocate the range including the unaligned edges */ - if (partial_begin || partial_end) { - ret = ext4_alloc_file_blocks(file, - round_down(offset, 1 << blkbits) >> blkbits, - (round_up((offset + len), 1 << blkbits) - - round_down(offset, 1 << blkbits)) >> blkbits, - new_size, flags); - if (ret) - goto out_mutex; + if (!IS_ALIGNED(offset | end, blocksize)) { + ext4_lblk_t alloc_lblk = offset >> blkbits; + ext4_lblk_t len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits); + ret = ext4_alloc_file_blocks(file, alloc_lblk, len_lblk, + new_size, flags); + if (ret) + return ret; } - /* Zero range excluding the unaligned edges */ - if (max_blocks > 0) { - flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | - EXT4_EX_NOCACHE); - - /* - * Prevent page faults from reinstantiating pages we have - * released from page cache. - */ - filemap_invalidate_lock(mapping); - - ret = ext4_break_layouts(inode); - if (ret) { - filemap_invalidate_unlock(mapping); - goto out_mutex; - } - - ret = ext4_update_disksize_before_punch(inode, offset, len); - if (ret) { - filemap_invalidate_unlock(mapping); - goto out_mutex; - } - - /* - * For journalled data we need to write (and checkpoint) pages - * before discarding page cache to avoid inconsitent data on - * disk in case of crash before zeroing trans is committed. - */ - if (ext4_should_journal_data(inode)) { - ret = filemap_write_and_wait_range(mapping, start, - end - 1); - if (ret) { - filemap_invalidate_unlock(mapping); - goto out_mutex; - } - } + ret = ext4_update_disksize_before_punch(inode, offset, len); + if (ret) + return ret; - /* Now release the pages and zero block aligned part of pages */ - truncate_pagecache_range(inode, start, end - 1); - inode->i_mtime = inode_set_ctime_current(inode); + /* Now release the pages and zero block aligned part of pages */ + ret = ext4_truncate_page_cache_block_range(inode, offset, end); + if (ret) + return ret; - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, - flags); - filemap_invalidate_unlock(mapping); + /* Zero range excluding the unaligned edges */ + start_lblk = EXT4_B_TO_LBLK(inode, offset); + end_lblk = end >> blkbits; + if (end_lblk > start_lblk) { + ext4_lblk_t zero_blks = end_lblk - start_lblk; + + flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE); + ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks, + new_size, flags); if (ret) - goto out_mutex; + return ret; } - if (!partial_begin && !partial_end) - goto out_mutex; + /* Finish zeroing out if it doesn't contain partial block */ + if (IS_ALIGNED(offset | end, blocksize)) + return ret; /* * In worst case we have to writeout two nonadjacent unwritten @@ -4692,27 +4640,69 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(inode->i_sb, ret); - goto out_mutex; + return ret; } - inode->i_mtime = inode_set_ctime_current(inode); + /* Zero out partial block at the edges of the range */ + ret = ext4_zero_partial_blocks(handle, inode, offset, len); + if (ret) + goto out_handle; + if (new_size) ext4_update_inode_size(inode, new_size); ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) goto out_handle; - /* Zero out partial block at the edges of the range */ - ret = ext4_zero_partial_blocks(handle, inode, offset, len); - if (ret >= 0) - ext4_update_inode_fsync_trans(handle, inode, 1); + ext4_update_inode_fsync_trans(handle, inode, 1); if (file->f_flags & O_SYNC) ext4_handle_sync(handle); out_handle: ext4_journal_stop(handle); -out_mutex: - inode_unlock(inode); + return ret; +} + +static long ext4_do_fallocate(struct file *file, loff_t offset, + loff_t len, int mode) +{ + struct inode *inode = file_inode(file); + loff_t end = offset + len; + loff_t new_size = 0; + ext4_lblk_t start_lblk, len_lblk; + int ret; + + trace_ext4_fallocate_enter(inode, offset, len, mode); + WARN_ON_ONCE(!inode_is_locked(inode)); + + start_lblk = offset >> inode->i_blkbits; + len_lblk = EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits); + + /* We only support preallocation for extent-based files only. */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + ret = -EOPNOTSUPP; + goto out; + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) { + new_size = end; + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto out; + } + + ret = ext4_alloc_file_blocks(file, start_lblk, len_lblk, new_size, + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); + if (ret) + goto out; + + if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { + ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, + EXT4_I(inode)->i_sync_tid); + } +out: + trace_ext4_fallocate_exit(inode, offset, len_lblk, ret); return ret; } @@ -4726,12 +4716,8 @@ out_mutex: long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - loff_t new_size = 0; - unsigned int max_blocks; - int ret = 0; - int flags; - ext4_lblk_t lblk; - unsigned int blkbits = inode->i_blkbits; + struct address_space *mapping = file->f_mapping; + int ret; /* * Encrypted inodes can't handle collapse range or insert @@ -4751,73 +4737,47 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) inode_lock(inode); ret = ext4_convert_inline_data(inode); - inode_unlock(inode); if (ret) - goto exit; - - if (mode & FALLOC_FL_PUNCH_HOLE) { - ret = ext4_punch_hole(file, offset, len); - goto exit; - } + goto out_inode_lock; - if (mode & FALLOC_FL_COLLAPSE_RANGE) { - ret = ext4_collapse_range(file, offset, len); - goto exit; - } + /* Wait all existing dio workers, newcomers will block on i_rwsem */ + inode_dio_wait(inode); - if (mode & FALLOC_FL_INSERT_RANGE) { - ret = ext4_insert_range(file, offset, len); - goto exit; - } + ret = file_modified(file); + if (ret) + goto out_inode_lock; - if (mode & FALLOC_FL_ZERO_RANGE) { - ret = ext4_zero_range(file, offset, len, mode); - goto exit; + if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ALLOCATE_RANGE) { + ret = ext4_do_fallocate(file, offset, len, mode); + goto out_inode_lock; } - trace_ext4_fallocate_enter(inode, offset, len, mode); - lblk = offset >> blkbits; - - max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); - flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; - - inode_lock(inode); /* - * We only support preallocation for extent-based files only + * Follow-up operations will drop page cache, hold invalidate lock + * to prevent page faults from reinstantiating pages we have + * released from page cache. */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { - ret = -EOPNOTSUPP; - goto out; - } - - if (!(mode & FALLOC_FL_KEEP_SIZE) && - (offset + len > inode->i_size || - offset + len > EXT4_I(inode)->i_disksize)) { - new_size = offset + len; - ret = inode_newsize_ok(inode, new_size); - if (ret) - goto out; - } - - /* Wait all existing dio workers, newcomers will block on i_rwsem */ - inode_dio_wait(inode); + filemap_invalidate_lock(mapping); - ret = file_modified(file); + ret = ext4_break_layouts(inode); if (ret) - goto out; + goto out_invalidate_lock; - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); - if (ret) - goto out; + if (mode & FALLOC_FL_PUNCH_HOLE) + ret = ext4_punch_hole(file, offset, len); + else if (mode & FALLOC_FL_COLLAPSE_RANGE) + ret = ext4_collapse_range(file, offset, len); + else if (mode & FALLOC_FL_INSERT_RANGE) + ret = ext4_insert_range(file, offset, len); + else if (mode & FALLOC_FL_ZERO_RANGE) + ret = ext4_zero_range(file, offset, len, mode); + else + ret = -EOPNOTSUPP; - if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { - ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, - EXT4_I(inode)->i_sync_tid); - } -out: +out_invalidate_lock: + filemap_invalidate_unlock(mapping); +out_inode_lock: inode_unlock(inode); - trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); -exit: return ret; } @@ -4972,12 +4932,7 @@ static const struct iomap_ops ext4_iomap_xattr_ops = { static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len) { - u64 maxbytes; - - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - maxbytes = inode->i_sb->s_maxbytes; - else - maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; + u64 maxbytes = ext4_get_maxbytes(inode); if (*len == 0) return -EINVAL; @@ -5201,7 +5156,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, * won't be shifted beyond EXT_MAX_BLOCKS. */ if (SHIFT == SHIFT_LEFT) { - path = ext4_find_extent(inode, start - 1, &path, + path = ext4_find_extent(inode, start - 1, path, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); @@ -5250,7 +5205,7 @@ again: * becomes NULL to indicate the end of the loop. */ while (iterator && start <= stop) { - path = ext4_find_extent(inode, *iterator, &path, + path = ext4_find_extent(inode, *iterator, path, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); @@ -5319,109 +5274,72 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; - ext4_lblk_t punch_start, punch_stop; + loff_t end = offset + len; + ext4_lblk_t start_lblk, end_lblk; handle_t *handle; unsigned int credits; - loff_t new_size, ioffset; + loff_t start, new_size; int ret; - /* - * We need to test this early because xfstests assumes that a - * collapse range of (0, 1) will return EOPNOTSUPP if the file - * system does not support collapse range. - */ + trace_ext4_collapse_range(inode, offset, len); + WARN_ON_ONCE(!inode_is_locked(inode)); + + /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; - /* Collapse range works only on fs cluster size aligned regions. */ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; - - trace_ext4_collapse_range(inode, offset, len); - - punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); - punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); - - inode_lock(inode); /* * There is no need to overlap collapse range with EOF, in which case * it is effectively a truncate operation */ - if (offset + len >= inode->i_size) { - ret = -EINVAL; - goto out_mutex; - } - - /* Currently just for extent based files */ - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - ret = -EOPNOTSUPP; - goto out_mutex; - } - - /* Wait for existing dio to complete */ - inode_dio_wait(inode); - - ret = file_modified(file); - if (ret) - goto out_mutex; - - /* - * Prevent page faults from reinstantiating pages we have released from - * page cache. - */ - filemap_invalidate_lock(mapping); - - ret = ext4_break_layouts(inode); - if (ret) - goto out_mmap; + if (end >= inode->i_size) + return -EINVAL; /* + * Write tail of the last page before removed range and data that + * will be shifted since they will get removed from the page cache + * below. We are also protected from pages becoming dirty by + * i_rwsem and invalidate_lock. * Need to round down offset to be aligned with page size boundary * for page size > block size. */ - ioffset = round_down(offset, PAGE_SIZE); - /* - * Write tail of the last page before removed range since it will get - * removed from the page cache below. - */ - ret = filemap_write_and_wait_range(mapping, ioffset, offset); - if (ret) - goto out_mmap; - /* - * Write data that will be shifted to preserve them when discarding - * page cache below. We are also protected from pages becoming dirty - * by i_rwsem and invalidate_lock. - */ - ret = filemap_write_and_wait_range(mapping, offset + len, - LLONG_MAX); + start = round_down(offset, PAGE_SIZE); + ret = filemap_write_and_wait_range(mapping, start, offset); + if (!ret) + ret = filemap_write_and_wait_range(mapping, end, LLONG_MAX); if (ret) - goto out_mmap; - truncate_pagecache(inode, ioffset); + return ret; + + truncate_pagecache(inode, start); credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_mmap; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); + start_lblk = offset >> inode->i_blkbits; + end_lblk = (offset + len) >> inode->i_blkbits; + down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode, 0); - ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start); + ext4_discard_preallocations(inode); + ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk); - ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); + ret = ext4_ext_remove_space(inode, start_lblk, end_lblk - 1); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); - goto out_stop; + goto out_handle; } - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); - ret = ext4_ext_shift_extents(inode, handle, punch_stop, - punch_stop - punch_start, SHIFT_LEFT); + ret = ext4_ext_shift_extents(inode, handle, end_lblk, + end_lblk - start_lblk, SHIFT_LEFT); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); - goto out_stop; + goto out_handle; } new_size = inode->i_size - len; @@ -5429,18 +5347,16 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) EXT4_I(inode)->i_disksize = new_size; up_write(&EXT4_I(inode)->i_data_sem); - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - inode->i_mtime = inode_set_ctime_current(inode); ret = ext4_mark_inode_dirty(handle, inode); + if (ret) + goto out_handle; + ext4_update_inode_fsync_trans(handle, inode, 1); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); -out_stop: +out_handle: ext4_journal_stop(handle); -out_mmap: - filemap_invalidate_unlock(mapping); -out_mutex: - inode_unlock(inode); return ret; } @@ -5460,100 +5376,63 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) handle_t *handle; struct ext4_ext_path *path; struct ext4_extent *extent; - ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0; + ext4_lblk_t start_lblk, len_lblk, ee_start_lblk = 0; unsigned int credits, ee_len; - int ret = 0, depth, split_flag = 0; - loff_t ioffset; + int ret, depth, split_flag = 0; + loff_t start; - /* - * We need to test this early because xfstests assumes that an - * insert range of (0, 1) will return EOPNOTSUPP if the file - * system does not support insert range. - */ + trace_ext4_insert_range(inode, offset, len); + WARN_ON_ONCE(!inode_is_locked(inode)); + + /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; - /* Insert range works only on fs cluster size aligned regions. */ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; - - trace_ext4_insert_range(inode, offset, len); - - offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); - len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb); - - inode_lock(inode); - /* Currently just for extent based files */ - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - ret = -EOPNOTSUPP; - goto out_mutex; - } - - /* Check whether the maximum file size would be exceeded */ - if (len > inode->i_sb->s_maxbytes - inode->i_size) { - ret = -EFBIG; - goto out_mutex; - } - /* Offset must be less than i_size */ - if (offset >= inode->i_size) { - ret = -EINVAL; - goto out_mutex; - } - - /* Wait for existing dio to complete */ - inode_dio_wait(inode); - - ret = file_modified(file); - if (ret) - goto out_mutex; + if (offset >= inode->i_size) + return -EINVAL; + /* Check whether the maximum file size would be exceeded */ + if (len > inode->i_sb->s_maxbytes - inode->i_size) + return -EFBIG; /* - * Prevent page faults from reinstantiating pages we have released from - * page cache. + * Write out all dirty pages. Need to round down to align start offset + * to page size boundary for page size > block size. */ - filemap_invalidate_lock(mapping); - - ret = ext4_break_layouts(inode); + start = round_down(offset, PAGE_SIZE); + ret = filemap_write_and_wait_range(mapping, start, LLONG_MAX); if (ret) - goto out_mmap; + return ret; - /* - * Need to round down to align start offset to page size boundary - * for page size > block size. - */ - ioffset = round_down(offset, PAGE_SIZE); - /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, - LLONG_MAX); - if (ret) - goto out_mmap; - truncate_pagecache(inode, ioffset); + truncate_pagecache(inode, start); credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_mmap; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; EXT4_I(inode)->i_disksize += len; - inode->i_mtime = inode_set_ctime_current(inode); ret = ext4_mark_inode_dirty(handle, inode); if (ret) - goto out_stop; + goto out_handle; + + start_lblk = offset >> inode->i_blkbits; + len_lblk = len >> inode->i_blkbits; down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); - path = ext4_find_extent(inode, offset_lblk, NULL, 0); + path = ext4_find_extent(inode, start_lblk, NULL, 0); if (IS_ERR(path)) { up_write(&EXT4_I(inode)->i_data_sem); ret = PTR_ERR(path); - goto out_stop; + goto out_handle; } depth = ext_depth(inode); @@ -5563,51 +5442,47 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) ee_len = ext4_ext_get_actual_len(extent); /* - * If offset_lblk is not the starting block of extent, split - * the extent @offset_lblk + * If start_lblk is not the starting block of extent, split + * the extent @start_lblk */ - if ((offset_lblk > ee_start_lblk) && - (offset_lblk < (ee_start_lblk + ee_len))) { + if ((start_lblk > ee_start_lblk) && + (start_lblk < (ee_start_lblk + ee_len))) { if (ext4_ext_is_unwritten(extent)) split_flag = EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2; - ret = ext4_split_extent_at(handle, inode, &path, - offset_lblk, split_flag, + path = ext4_split_extent_at(handle, inode, path, + start_lblk, split_flag, EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_METADATA_NOFAIL); } - ext4_free_ext_path(path); - if (ret < 0) { + if (IS_ERR(path)) { up_write(&EXT4_I(inode)->i_data_sem); - goto out_stop; + ret = PTR_ERR(path); + goto out_handle; } - } else { - ext4_free_ext_path(path); } - ext4_es_remove_extent(inode, offset_lblk, EXT_MAX_BLOCKS - offset_lblk); + ext4_free_ext_path(path); + ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk); /* - * if offset_lblk lies in a hole which is at start of file, use + * if start_lblk lies in a hole which is at start of file, use * ee_start_lblk to shift extents */ ret = ext4_ext_shift_extents(inode, handle, - max(ee_start_lblk, offset_lblk), len_lblk, SHIFT_RIGHT); - + max(ee_start_lblk, start_lblk), len_lblk, SHIFT_RIGHT); up_write(&EXT4_I(inode)->i_data_sem); + if (ret) + goto out_handle; + + ext4_update_inode_fsync_trans(handle, inode, 1); if (IS_SYNC(inode)) ext4_handle_sync(handle); - if (ret >= 0) - ext4_update_inode_fsync_trans(handle, inode, 1); -out_stop: +out_handle: ext4_journal_stop(handle); -out_mmap: - filemap_invalidate_unlock(mapping); -out_mutex: - inode_unlock(inode); return ret; } @@ -5654,25 +5529,21 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, int e1_len, e2_len, len; int split = 0; - path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE); + path1 = ext4_find_extent(inode1, lblk1, path1, EXT4_EX_NOCACHE); if (IS_ERR(path1)) { *erp = PTR_ERR(path1); - path1 = NULL; - finish: - count = 0; - goto repeat; + goto errout; } - path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE); + path2 = ext4_find_extent(inode2, lblk2, path2, EXT4_EX_NOCACHE); if (IS_ERR(path2)) { *erp = PTR_ERR(path2); - path2 = NULL; - goto finish; + goto errout; } ex1 = path1[path1->p_depth].p_ext; ex2 = path2[path2->p_depth].p_ext; /* Do we have something to swap ? */ if (unlikely(!ex2 || !ex1)) - goto finish; + goto errout; e1_blk = le32_to_cpu(ex1->ee_block); e2_blk = le32_to_cpu(ex2->ee_block); @@ -5694,7 +5565,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, next2 = e2_blk; /* Do we have something to swap */ if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS) - goto finish; + goto errout; /* Move to the rightest boundary */ len = next1 - lblk1; if (len < next2 - lblk2) @@ -5704,28 +5575,32 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, lblk1 += len; lblk2 += len; count -= len; - goto repeat; + continue; } /* Prepare left boundary */ if (e1_blk < lblk1) { split = 1; - *erp = ext4_force_split_extent_at(handle, inode1, - &path1, lblk1, 0); - if (unlikely(*erp)) - goto finish; + path1 = ext4_force_split_extent_at(handle, inode1, + path1, lblk1, 0); + if (IS_ERR(path1)) { + *erp = PTR_ERR(path1); + goto errout; + } } if (e2_blk < lblk2) { split = 1; - *erp = ext4_force_split_extent_at(handle, inode2, - &path2, lblk2, 0); - if (unlikely(*erp)) - goto finish; + path2 = ext4_force_split_extent_at(handle, inode2, + path2, lblk2, 0); + if (IS_ERR(path2)) { + *erp = PTR_ERR(path2); + goto errout; + } } /* ext4_split_extent_at() may result in leaf extent split, * path must to be revalidated. */ if (split) - goto repeat; + continue; /* Prepare right boundary */ len = count; @@ -5736,30 +5611,34 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, if (len != e1_len) { split = 1; - *erp = ext4_force_split_extent_at(handle, inode1, - &path1, lblk1 + len, 0); - if (unlikely(*erp)) - goto finish; + path1 = ext4_force_split_extent_at(handle, inode1, + path1, lblk1 + len, 0); + if (IS_ERR(path1)) { + *erp = PTR_ERR(path1); + goto errout; + } } if (len != e2_len) { split = 1; - *erp = ext4_force_split_extent_at(handle, inode2, - &path2, lblk2 + len, 0); - if (*erp) - goto finish; + path2 = ext4_force_split_extent_at(handle, inode2, + path2, lblk2 + len, 0); + if (IS_ERR(path2)) { + *erp = PTR_ERR(path2); + goto errout; + } } /* ext4_split_extent_at() may result in leaf extent split, * path must to be revalidated. */ if (split) - goto repeat; + continue; BUG_ON(e2_len != e1_len); *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth); if (unlikely(*erp)) - goto finish; + goto errout; *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth); if (unlikely(*erp)) - goto finish; + goto errout; /* Both extents are fully inside boundaries. Swap it now */ tmp_ex = *ex1; @@ -5777,7 +5656,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, *erp = ext4_ext_dirty(handle, inode2, path2 + path2->p_depth); if (unlikely(*erp)) - goto finish; + goto errout; *erp = ext4_ext_dirty(handle, inode1, path1 + path1->p_depth); /* @@ -5787,17 +5666,17 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, * aborted anyway. */ if (unlikely(*erp)) - goto finish; + goto errout; + lblk1 += len; lblk2 += len; replaced_count += len; count -= len; - - repeat: - ext4_free_ext_path(path1); - ext4_free_ext_path(path2); - path1 = path2 = NULL; } + +errout: + ext4_free_ext_path(path1); + ext4_free_ext_path(path2); return replaced_count; } @@ -5832,11 +5711,8 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) /* search for the extent closest to the first block in the cluster */ path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0); - if (IS_ERR(path)) { - err = PTR_ERR(path); - path = NULL; - goto out; - } + if (IS_ERR(path)) + return PTR_ERR(path); depth = ext_depth(inode); @@ -5915,26 +5791,31 @@ int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, ext4_ext_get_actual_len(ex) != len) { /* We need to split this extent to match our extent first */ down_write(&EXT4_I(inode)->i_data_sem); - ret = ext4_force_split_extent_at(NULL, inode, &path, start, 1); + path = ext4_force_split_extent_at(NULL, inode, path, start, 1); up_write(&EXT4_I(inode)->i_data_sem); - if (ret) + if (IS_ERR(path)) { + ret = PTR_ERR(path); goto out; + } - path = ext4_find_extent(inode, start, &path, 0); + path = ext4_find_extent(inode, start, path, 0); if (IS_ERR(path)) return PTR_ERR(path); + ex = path[path->p_depth].p_ext; WARN_ON(le32_to_cpu(ex->ee_block) != start); if (ext4_ext_get_actual_len(ex) != len) { down_write(&EXT4_I(inode)->i_data_sem); - ret = ext4_force_split_extent_at(NULL, inode, &path, - start + len, 1); + path = ext4_force_split_extent_at(NULL, inode, path, + start + len, 1); up_write(&EXT4_I(inode)->i_data_sem); - if (ret) + if (IS_ERR(path)) { + ret = PTR_ERR(path); goto out; + } - path = ext4_find_extent(inode, start, &path, 0); + path = ext4_find_extent(inode, start, path, 0); if (IS_ERR(path)) return PTR_ERR(path); ex = path[path->p_depth].p_ext; @@ -6018,12 +5899,9 @@ int ext4_ext_replay_set_iblocks(struct inode *inode) if (IS_ERR(path)) return PTR_ERR(path); ex = path[path->p_depth].p_ext; - if (!ex) { - ext4_free_ext_path(path); + if (!ex) goto out; - } end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); - ext4_free_ext_path(path); /* Count the number of data blocks */ cur = 0; @@ -6049,32 +5927,28 @@ int ext4_ext_replay_set_iblocks(struct inode *inode) ret = skip_hole(inode, &cur); if (ret < 0) goto out; - path = ext4_find_extent(inode, cur, NULL, 0); + path = ext4_find_extent(inode, cur, path, 0); if (IS_ERR(path)) goto out; numblks += path->p_depth; - ext4_free_ext_path(path); while (cur < end) { - path = ext4_find_extent(inode, cur, NULL, 0); + path = ext4_find_extent(inode, cur, path, 0); if (IS_ERR(path)) break; ex = path[path->p_depth].p_ext; - if (!ex) { - ext4_free_ext_path(path); - return 0; - } + if (!ex) + goto cleanup; + cur = max(cur + 1, le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)); ret = skip_hole(inode, &cur); - if (ret < 0) { - ext4_free_ext_path(path); + if (ret < 0) break; - } - path2 = ext4_find_extent(inode, cur, NULL, 0); - if (IS_ERR(path2)) { - ext4_free_ext_path(path); + + path2 = ext4_find_extent(inode, cur, path2, 0); + if (IS_ERR(path2)) break; - } + for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) { cmp1 = cmp2 = 0; if (i <= path->p_depth) @@ -6086,13 +5960,14 @@ int ext4_ext_replay_set_iblocks(struct inode *inode) if (cmp1 != cmp2 && cmp2 != 0) numblks++; } - ext4_free_ext_path(path); - ext4_free_ext_path(path2); } out: inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9); ext4_mark_inode_dirty(NULL, inode); +cleanup: + ext4_free_ext_path(path); + ext4_free_ext_path(path2); return 0; } @@ -6113,12 +5988,9 @@ int ext4_ext_clear_bb(struct inode *inode) if (IS_ERR(path)) return PTR_ERR(path); ex = path[path->p_depth].p_ext; - if (!ex) { - ext4_free_ext_path(path); - return 0; - } + if (!ex) + goto out; end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); - ext4_free_ext_path(path); cur = 0; while (cur < end) { @@ -6128,23 +6000,25 @@ int ext4_ext_clear_bb(struct inode *inode) if (ret < 0) break; if (ret > 0) { - path = ext4_find_extent(inode, map.m_lblk, NULL, 0); - if (!IS_ERR_OR_NULL(path)) { + path = ext4_find_extent(inode, map.m_lblk, path, 0); + if (!IS_ERR(path)) { for (j = 0; j < path->p_depth; j++) { - ext4_mb_mark_bb(inode->i_sb, - path[j].p_block, 1, 0); + path[j].p_block, 1, false); ext4_fc_record_regions(inode->i_sb, inode->i_ino, 0, path[j].p_block, 1, 1); } - ext4_free_ext_path(path); + } else { + path = NULL; } - ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); + ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); ext4_fc_record_regions(inode->i_sb, inode->i_ino, map.m_lblk, map.m_pblk, map.m_len, 1); } cur = cur + map.m_len; } +out: + ext4_free_ext_path(path); return 0; } diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index d9d5cfb9c951..c786691dabd3 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -558,8 +558,8 @@ static int ext4_es_can_be_merged(struct extent_status *es1, if (ext4_es_is_hole(es1)) return 1; - /* we need to check delayed extent is without unwritten status */ - if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1)) + /* we need to check delayed extent */ + if (ext4_es_is_delayed(es1)) return 1; return 0; @@ -848,11 +848,12 @@ out: */ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, - unsigned int status) + unsigned int status, int flags) { struct extent_status newes; ext4_lblk_t end = lblk + len - 1; int err1 = 0, err2 = 0, err3 = 0; + int resv_used = 0, pending = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct extent_status *es1 = NULL; struct extent_status *es2 = NULL; @@ -862,21 +863,14 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; - es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n", - lblk, len, pblk, status, inode->i_ino); + es_debug("add [%u/%u) %llu %x %x to extent status tree of inode %lu\n", + lblk, len, pblk, status, flags, inode->i_ino); if (!len) return; BUG_ON(end < lblk); - - if ((status & EXTENT_STATUS_DELAYED) && - (status & EXTENT_STATUS_WRITTEN)) { - ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as " - " delayed and written which can potentially " - " cause data loss.", lblk, len); - WARN_ON(1); - } + WARN_ON_ONCE(status & EXTENT_STATUS_DELAYED); newes.es_lblk = lblk; newes.es_len = len; @@ -894,11 +888,11 @@ retry: es1 = __es_alloc_extent(true); if ((err1 || err2) && !es2) es2 = __es_alloc_extent(true); - if ((err1 || err2 || err3) && revise_pending && !pr) + if ((err1 || err2 || err3 < 0) && revise_pending && !pr) pr = __alloc_pending(true); write_lock(&EXT4_I(inode)->i_es_lock); - err1 = __es_remove_extent(inode, lblk, end, NULL, es1); + err1 = __es_remove_extent(inode, lblk, end, &resv_used, es1); if (err1 != 0) goto error; /* Free preallocated extent if it didn't get used. */ @@ -922,16 +916,38 @@ retry: if (revise_pending) { err3 = __revise_pending(inode, lblk, len, &pr); - if (err3 != 0) + if (err3 < 0) goto error; if (pr) { __free_pending(pr); pr = NULL; } + pending = err3; } error: write_unlock(&EXT4_I(inode)->i_es_lock); - if (err1 || err2 || err3) + /* + * Reduce the reserved cluster count to reflect successful deferred + * allocation of delayed allocated clusters or direct allocation of + * clusters discovered to be delayed allocated. Once allocated, a + * cluster is not included in the reserved count. + * + * When direct allocating (from fallocate, filemap, DIO, or clusters + * allocated when delalloc has been disabled by ext4_nonda_switch()) + * an extent either 1) contains delayed blocks but start with + * non-delayed allocated blocks (e.g. hole) or 2) contains non-delayed + * allocated blocks which belong to delayed allocated clusters when + * bigalloc feature is enabled, quota has already been claimed by + * ext4_mb_new_blocks(), so release the quota reservations made for + * any previously delayed allocated clusters instead of claim them + * again. + */ + resv_used += pending; + if (resv_used) + ext4_da_update_reserve_space(inode, resv_used, + flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE); + + if (err1 || err2 || err3 < 0) goto retry; ext4_es_print_tree(inode); @@ -1051,7 +1067,7 @@ out: } struct rsvd_count { - int ndelonly; + int ndelayed; bool first_do_lblk_found; ext4_lblk_t first_do_lblk; ext4_lblk_t last_do_lblk; @@ -1077,10 +1093,10 @@ static void init_rsvd(struct inode *inode, ext4_lblk_t lblk, struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct rb_node *node; - rc->ndelonly = 0; + rc->ndelayed = 0; /* - * for bigalloc, note the first delonly block in the range has not + * for bigalloc, note the first delayed block in the range has not * been found, record the extent containing the block to the left of * the region to be removed, if any, and note that there's no partial * cluster to track @@ -1100,9 +1116,8 @@ static void init_rsvd(struct inode *inode, ext4_lblk_t lblk, } /* - * count_rsvd - count the clusters containing delayed and not unwritten - * (delonly) blocks in a range within an extent and add to - * the running tally in rsvd_count + * count_rsvd - count the clusters containing delayed blocks in a range + * within an extent and add to the running tally in rsvd_count * * @inode - file containing extent * @lblk - first block in range @@ -1119,13 +1134,13 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t i, end, nclu; - if (!ext4_es_is_delonly(es)) + if (!ext4_es_is_delayed(es)) return; WARN_ON(len <= 0); if (sbi->s_cluster_ratio == 1) { - rc->ndelonly += (int) len; + rc->ndelayed += (int) len; return; } @@ -1135,7 +1150,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, end = lblk + (ext4_lblk_t) len - 1; end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end; - /* record the first block of the first delonly extent seen */ + /* record the first block of the first delayed extent seen */ if (!rc->first_do_lblk_found) { rc->first_do_lblk = i; rc->first_do_lblk_found = true; @@ -1149,7 +1164,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, * doesn't start with it, count it and stop tracking */ if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) { - rc->ndelonly++; + rc->ndelayed++; rc->partial = false; } @@ -1159,7 +1174,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, */ if (EXT4_LBLK_COFF(sbi, i) != 0) { if (end >= EXT4_LBLK_CFILL(sbi, i)) { - rc->ndelonly++; + rc->ndelayed++; rc->partial = false; i = EXT4_LBLK_CFILL(sbi, i) + 1; } @@ -1167,11 +1182,11 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, /* * if the current cluster starts on a cluster boundary, count the - * number of whole delonly clusters in the extent + * number of whole delayed clusters in the extent */ if ((i + sbi->s_cluster_ratio - 1) <= end) { nclu = (end - i + 1) >> sbi->s_cluster_bits; - rc->ndelonly += nclu; + rc->ndelayed += nclu; i += nclu << sbi->s_cluster_bits; } @@ -1231,10 +1246,9 @@ static struct pending_reservation *__pr_tree_search(struct rb_root *root, * @rc - pointer to reserved count data * * The number of reservations to be released is equal to the number of - * clusters containing delayed and not unwritten (delonly) blocks within - * the range, minus the number of clusters still containing delonly blocks - * at the ends of the range, and minus the number of pending reservations - * within the range. + * clusters containing delayed blocks within the range, minus the number of + * clusters still containing delayed blocks at the ends of the range, and + * minus the number of pending reservations within the range. */ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, struct extent_status *right_es, @@ -1245,33 +1259,33 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; struct rb_node *node; ext4_lblk_t first_lclu, last_lclu; - bool left_delonly, right_delonly, count_pending; + bool left_delayed, right_delayed, count_pending; struct extent_status *es; if (sbi->s_cluster_ratio > 1) { /* count any remaining partial cluster */ if (rc->partial) - rc->ndelonly++; + rc->ndelayed++; - if (rc->ndelonly == 0) + if (rc->ndelayed == 0) return 0; first_lclu = EXT4_B2C(sbi, rc->first_do_lblk); last_lclu = EXT4_B2C(sbi, rc->last_do_lblk); /* - * decrease the delonly count by the number of clusters at the - * ends of the range that still contain delonly blocks - + * decrease the delayed count by the number of clusters at the + * ends of the range that still contain delayed blocks - * these clusters still need to be reserved */ - left_delonly = right_delonly = false; + left_delayed = right_delayed = false; es = rc->left_es; while (es && ext4_es_end(es) >= EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) { - if (ext4_es_is_delonly(es)) { - rc->ndelonly--; - left_delonly = true; + if (ext4_es_is_delayed(es)) { + rc->ndelayed--; + left_delayed = true; break; } node = rb_prev(&es->rb_node); @@ -1279,7 +1293,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, break; es = rb_entry(node, struct extent_status, rb_node); } - if (right_es && (!left_delonly || first_lclu != last_lclu)) { + if (right_es && (!left_delayed || first_lclu != last_lclu)) { if (end < ext4_es_end(right_es)) { es = right_es; } else { @@ -1289,9 +1303,9 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, } while (es && es->es_lblk <= EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) { - if (ext4_es_is_delonly(es)) { - rc->ndelonly--; - right_delonly = true; + if (ext4_es_is_delayed(es)) { + rc->ndelayed--; + right_delayed = true; break; } node = rb_next(&es->rb_node); @@ -1305,21 +1319,21 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, /* * Determine the block range that should be searched for * pending reservations, if any. Clusters on the ends of the - * original removed range containing delonly blocks are + * original removed range containing delayed blocks are * excluded. They've already been accounted for and it's not * possible to determine if an associated pending reservation * should be released with the information available in the * extents status tree. */ if (first_lclu == last_lclu) { - if (left_delonly | right_delonly) + if (left_delayed | right_delayed) count_pending = false; else count_pending = true; } else { - if (left_delonly) + if (left_delayed) first_lclu++; - if (right_delonly) + if (right_delayed) last_lclu--; if (first_lclu <= last_lclu) count_pending = true; @@ -1330,13 +1344,13 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, /* * a pending reservation found between first_lclu and last_lclu * represents an allocated cluster that contained at least one - * delonly block, so the delonly total must be reduced by one + * delayed block, so the delayed total must be reduced by one * for each pending reservation found and released */ if (count_pending) { pr = __pr_tree_search(&tree->root, first_lclu); while (pr && pr->lclu <= last_lclu) { - rc->ndelonly--; + rc->ndelayed--; node = rb_next(&pr->rb_node); rb_erase(&pr->rb_node, &tree->root); __free_pending(pr); @@ -1347,7 +1361,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, } } } - return rc->ndelonly; + return rc->ndelayed; } @@ -1634,7 +1648,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink, unsigned long nr; struct ext4_sb_info *sbi; - sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); + sbi = shrink->private_data; nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); return nr; @@ -1643,8 +1657,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink, static unsigned long ext4_es_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct ext4_sb_info *sbi = container_of(shrink, - struct ext4_sb_info, s_es_shrinker); + struct ext4_sb_info *sbi = shrink->private_data; int nr_to_scan = sc->nr_to_scan; int ret, nr_shrunk; @@ -1728,13 +1741,17 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) if (err) goto err3; - sbi->s_es_shrinker.scan_objects = ext4_es_scan; - sbi->s_es_shrinker.count_objects = ext4_es_count; - sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; - err = register_shrinker(&sbi->s_es_shrinker, "ext4-es:%s", - sbi->s_sb->s_id); - if (err) + sbi->s_es_shrinker = shrinker_alloc(0, "ext4-es:%s", sbi->s_sb->s_id); + if (!sbi->s_es_shrinker) { + err = -ENOMEM; goto err4; + } + + sbi->s_es_shrinker->scan_objects = ext4_es_scan; + sbi->s_es_shrinker->count_objects = ext4_es_count; + sbi->s_es_shrinker->private_data = sbi; + + shrinker_register(sbi->s_es_shrinker); return 0; err4: @@ -1754,7 +1771,7 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses); percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); - unregister_shrinker(&sbi->s_es_shrinker); + shrinker_free(sbi->s_es_shrinker); } /* @@ -1937,7 +1954,7 @@ static struct pending_reservation *__get_pending(struct inode *inode, * @lblk - logical block in the cluster to be added * @prealloc - preallocated pending entry * - * Returns 0 on successful insertion and -ENOMEM on failure. If the + * Returns 1 on successful insertion and -ENOMEM on failure. If the * pending reservation is already in the set, returns successfully. */ static int __insert_pending(struct inode *inode, ext4_lblk_t lblk, @@ -1981,6 +1998,7 @@ static int __insert_pending(struct inode *inode, ext4_lblk_t lblk, rb_link_node(&pr->rb_node, parent, p); rb_insert_color(&pr->rb_node, &tree->root); + ret = 1; out: return ret; @@ -2051,34 +2069,49 @@ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk) } /* - * ext4_es_insert_delayed_block - adds a delayed block to the extents status - * tree, adding a pending reservation where - * needed + * ext4_es_insert_delayed_extent - adds some delayed blocks to the extents + * status tree, adding a pending reservation + * where needed * * @inode - file containing the newly added block - * @lblk - logical block to be added - * @allocated - indicates whether a physical cluster has been allocated for - * the logical cluster that contains the block + * @lblk - start logical block to be added + * @len - length of blocks to be added + * @lclu_allocated/end_allocated - indicates whether a physical cluster has + * been allocated for the logical cluster + * that contains the start/end block. Note that + * end_allocated should always be set to false + * if the start and the end block are in the + * same cluster */ -void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, - bool allocated) +void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, bool lclu_allocated, + bool end_allocated) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct extent_status newes; + ext4_lblk_t end = lblk + len - 1; int err1 = 0, err2 = 0, err3 = 0; struct extent_status *es1 = NULL; struct extent_status *es2 = NULL; - struct pending_reservation *pr = NULL; + struct pending_reservation *pr1 = NULL; + struct pending_reservation *pr2 = NULL; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; - es_debug("add [%u/1) delayed to extent status tree of inode %lu\n", - lblk, inode->i_ino); + es_debug("add [%u/%u) delayed to extent status tree of inode %lu\n", + lblk, len, inode->i_ino); + if (!len) + return; + + WARN_ON_ONCE((EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) && + end_allocated); newes.es_lblk = lblk; - newes.es_len = 1; + newes.es_len = len; ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED); - trace_ext4_es_insert_delayed_block(inode, &newes, allocated); + trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated, + end_allocated); ext4_es_insert_extent_check(inode, &newes); @@ -2087,11 +2120,15 @@ retry: es1 = __es_alloc_extent(true); if ((err1 || err2) && !es2) es2 = __es_alloc_extent(true); - if ((err1 || err2 || err3) && allocated && !pr) - pr = __alloc_pending(true); + if (err1 || err2 || err3 < 0) { + if (lclu_allocated && !pr1) + pr1 = __alloc_pending(true); + if (end_allocated && !pr2) + pr2 = __alloc_pending(true); + } write_lock(&EXT4_I(inode)->i_es_lock); - err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1); + err1 = __es_remove_extent(inode, lblk, end, NULL, es1); if (err1 != 0) goto error; /* Free preallocated extent if it didn't get used. */ @@ -2111,18 +2148,27 @@ retry: es2 = NULL; } - if (allocated) { - err3 = __insert_pending(inode, lblk, &pr); - if (err3 != 0) + if (lclu_allocated) { + err3 = __insert_pending(inode, lblk, &pr1); + if (err3 < 0) goto error; - if (pr) { - __free_pending(pr); - pr = NULL; + if (pr1) { + __free_pending(pr1); + pr1 = NULL; + } + } + if (end_allocated) { + err3 = __insert_pending(inode, end, &pr2); + if (err3 < 0) + goto error; + if (pr2) { + __free_pending(pr2); + pr2 = NULL; } } error: write_unlock(&EXT4_I(inode)->i_es_lock); - if (err1 || err2 || err3) + if (err1 || err2 || err3 < 0) goto retry; ext4_es_print_tree(inode); @@ -2131,94 +2177,6 @@ error: } /* - * __es_delayed_clu - count number of clusters containing blocks that - * are delayed only - * - * @inode - file containing block range - * @start - logical block defining start of range - * @end - logical block defining end of range - * - * Returns the number of clusters containing only delayed (not delayed - * and unwritten) blocks in the range specified by @start and @end. Any - * cluster or part of a cluster within the range and containing a delayed - * and not unwritten block within the range is counted as a whole cluster. - */ -static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start, - ext4_lblk_t end) -{ - struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; - struct extent_status *es; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct rb_node *node; - ext4_lblk_t first_lclu, last_lclu; - unsigned long long last_counted_lclu; - unsigned int n = 0; - - /* guaranteed to be unequal to any ext4_lblk_t value */ - last_counted_lclu = ~0ULL; - - es = __es_tree_search(&tree->root, start); - - while (es && (es->es_lblk <= end)) { - if (ext4_es_is_delonly(es)) { - if (es->es_lblk <= start) - first_lclu = EXT4_B2C(sbi, start); - else - first_lclu = EXT4_B2C(sbi, es->es_lblk); - - if (ext4_es_end(es) >= end) - last_lclu = EXT4_B2C(sbi, end); - else - last_lclu = EXT4_B2C(sbi, ext4_es_end(es)); - - if (first_lclu == last_counted_lclu) - n += last_lclu - first_lclu; - else - n += last_lclu - first_lclu + 1; - last_counted_lclu = last_lclu; - } - node = rb_next(&es->rb_node); - if (!node) - break; - es = rb_entry(node, struct extent_status, rb_node); - } - - return n; -} - -/* - * ext4_es_delayed_clu - count number of clusters containing blocks that - * are both delayed and unwritten - * - * @inode - file containing block range - * @lblk - logical block defining start of range - * @len - number of blocks in range - * - * Locking for external use of __es_delayed_clu(). - */ -unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - ext4_lblk_t end; - unsigned int n; - - if (len == 0) - return 0; - - end = lblk + len - 1; - WARN_ON(end < lblk); - - read_lock(&ei->i_es_lock); - - n = __es_delayed_clu(inode, lblk, end); - - read_unlock(&ei->i_es_lock); - - return n; -} - -/* * __revise_pending - makes, cancels, or leaves unchanged pending cluster * reservations for a specified block range depending * upon the presence or absence of delayed blocks @@ -2232,7 +2190,9 @@ unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, * * Used after a newly allocated extent is added to the extents status tree. * Requires that the extents in the range have either written or unwritten - * status. Must be called while holding i_es_lock. + * status. Must be called while holding i_es_lock. Returns number of new + * inserts pending cluster on insert pendings, returns 0 on remove pendings, + * return -ENOMEM on failure. */ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, @@ -2242,6 +2202,7 @@ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t end = lblk + len - 1; ext4_lblk_t first, last; bool f_del = false, l_del = false; + int pendings = 0; int ret = 0; if (len == 0) @@ -2263,49 +2224,53 @@ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) { first = EXT4_LBLK_CMASK(sbi, lblk); if (first != lblk) - f_del = __es_scan_range(inode, &ext4_es_is_delonly, + f_del = __es_scan_range(inode, &ext4_es_is_delayed, first, lblk - 1); if (f_del) { ret = __insert_pending(inode, first, prealloc); if (ret < 0) goto out; + pendings += ret; } else { last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1; if (last != end) l_del = __es_scan_range(inode, - &ext4_es_is_delonly, + &ext4_es_is_delayed, end + 1, last); if (l_del) { ret = __insert_pending(inode, last, prealloc); if (ret < 0) goto out; + pendings += ret; } else __remove_pending(inode, last); } } else { first = EXT4_LBLK_CMASK(sbi, lblk); if (first != lblk) - f_del = __es_scan_range(inode, &ext4_es_is_delonly, + f_del = __es_scan_range(inode, &ext4_es_is_delayed, first, lblk - 1); if (f_del) { ret = __insert_pending(inode, first, prealloc); if (ret < 0) goto out; + pendings += ret; } else __remove_pending(inode, first); last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1; if (last != end) - l_del = __es_scan_range(inode, &ext4_es_is_delonly, + l_del = __es_scan_range(inode, &ext4_es_is_delayed, end + 1, last); if (l_del) { ret = __insert_pending(inode, last, prealloc); if (ret < 0) goto out; + pendings += ret; } else __remove_pending(inode, last); } out: - return ret; + return (ret < 0) ? ret : pendings; } diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index d9847a4a25db..4424232de298 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -42,6 +42,10 @@ enum { #define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS) #define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT) +/* + * Besides EXTENT_STATUS_REFERENCED, all these extent type masks + * are exclusive, only one type can be set at a time. + */ #define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B) #define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B) #define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B) @@ -51,7 +55,9 @@ enum { #define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \ EXTENT_STATUS_UNWRITTEN | \ EXTENT_STATUS_DELAYED | \ - EXTENT_STATUS_HOLE) << ES_SHIFT) + EXTENT_STATUS_HOLE)) + +#define ES_TYPE_VALID(type) ((type) && !((type) & ((type) - 1))) struct ext4_sb_info; struct ext4_extent; @@ -129,7 +135,7 @@ extern void ext4_es_init_tree(struct ext4_es_tree *tree); extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, - unsigned int status); + unsigned int status, int flags); extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status); @@ -156,7 +162,7 @@ static inline unsigned int ext4_es_status(struct extent_status *es) static inline unsigned int ext4_es_type(struct extent_status *es) { - return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT; + return (es->es_pblk >> ES_SHIFT) & ES_TYPE_MASK; } static inline int ext4_es_is_written(struct extent_status *es) @@ -184,11 +190,6 @@ static inline int ext4_es_is_mapped(struct extent_status *es) return (ext4_es_is_written(es) || ext4_es_is_unwritten(es)); } -static inline int ext4_es_is_delonly(struct extent_status *es) -{ - return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es)); -} - static inline void ext4_es_set_referenced(struct extent_status *es) { es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; @@ -224,17 +225,12 @@ static inline void ext4_es_store_pblock(struct extent_status *es, es->es_pblk = block; } -static inline void ext4_es_store_status(struct extent_status *es, - unsigned int status) -{ - es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | - (es->es_pblk & ~ES_MASK); -} - static inline void ext4_es_store_pblock_status(struct extent_status *es, ext4_fsblk_t pb, unsigned int status) { + WARN_ON_ONCE(!ES_TYPE_VALID(status & ES_TYPE_MASK)); + es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | (pb & ~ES_MASK); } @@ -249,10 +245,9 @@ extern void ext4_exit_pending(void); extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); -extern void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, - bool allocated); -extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len); +extern void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, bool lclu_allocated, + bool end_allocated); extern void ext4_clear_inode_es(struct inode *inode); #endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index b527f4ab47e0..b33664f6ce2a 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1795,7 +1795,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb, if (ret == 0) { /* Range is not mapped */ - path = ext4_find_extent(inode, cur, NULL, 0); + path = ext4_find_extent(inode, cur, path, 0); if (IS_ERR(path)) goto out; memset(&newex, 0, sizeof(newex)); @@ -1806,11 +1806,10 @@ static int ext4_fc_replay_add_range(struct super_block *sb, if (ext4_ext_is_unwritten(ex)) ext4_ext_mark_unwritten(&newex); down_write(&EXT4_I(inode)->i_data_sem); - ret = ext4_ext_insert_extent( - NULL, inode, &path, &newex, 0); + path = ext4_ext_insert_extent(NULL, inode, + path, &newex, 0); up_write((&EXT4_I(inode)->i_data_sem)); - ext4_free_ext_path(path); - if (ret) + if (IS_ERR(path)) goto out; goto next; } @@ -1835,7 +1834,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb, * at the end of the FC replay using our array of * modified inodes. */ - ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); + ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); goto next; } @@ -1859,6 +1858,7 @@ next: ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> sb->s_blocksize_bits); out: + ext4_free_ext_path(path); iput(inode); return 0; } @@ -1904,7 +1904,7 @@ ext4_fc_replay_del_range(struct super_block *sb, if (ret > 0) { remaining -= ret; cur += ret; - ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); + ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); } else { remaining -= map.m_len; cur += map.m_len; @@ -1959,22 +1959,25 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) break; if (ret > 0) { - path = ext4_find_extent(inode, map.m_lblk, NULL, 0); + path = ext4_find_extent(inode, map.m_lblk, path, 0); if (!IS_ERR(path)) { for (j = 0; j < path->p_depth; j++) ext4_mb_mark_bb(inode->i_sb, - path[j].p_block, 1, 1); - ext4_free_ext_path(path); + path[j].p_block, 1, true); + } else { + path = NULL; } cur += ret; ext4_mb_mark_bb(inode->i_sb, map.m_pblk, - map.m_len, 1); + map.m_len, true); } else { cur = cur + (map.m_len ? map.m_len : 1); } } iput(inode); } + + ext4_free_ext_path(path); } /* diff --git a/fs/ext4/file.c b/fs/ext4/file.c index c71af675e310..6c692151b0d6 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -174,7 +174,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp) (atomic_read(&inode->i_writecount) == 1) && !EXT4_I(inode)->i_reserved_data_blocks) { down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); } if (is_dx(inode) && filp->private_data) @@ -306,7 +306,7 @@ out: } static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, - ssize_t count) + ssize_t written, ssize_t count) { handle_t *handle; @@ -315,7 +315,7 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, if (IS_ERR(handle)) return PTR_ERR(handle); - if (ext4_update_inode_size(inode, offset + count)) { + if (ext4_update_inode_size(inode, offset + written)) { int ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) { ext4_journal_stop(handle); @@ -323,11 +323,11 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, } } - if (inode->i_nlink) + if ((written == count) && inode->i_nlink) ext4_orphan_del(handle, inode); ext4_journal_stop(handle); - return count; + return written; } /* @@ -393,7 +393,7 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) && pos + size <= i_size_read(inode)) return size; - return ext4_handle_inode_extension(inode, pos, size); + return ext4_handle_inode_extension(inode, pos, size, size); } static const struct iomap_dio_ops ext4_dio_write_ops = { @@ -669,7 +669,7 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); if (extend) { - ret = ext4_handle_inode_extension(inode, offset, ret); + ret = ext4_handle_inode_extension(inode, offset, ret, count); ext4_inode_extension_cleanup(inode, ret < (ssize_t)count); } out: @@ -844,8 +844,7 @@ static int ext4_sample_last_mounted(struct super_block *sb, if (err) goto out_journal; lock_buffer(sbi->s_sbh); - strncpy(sbi->s_es->s_last_mounted, cp, - sizeof(sbi->s_es->s_last_mounted)); + strtomem_pad(sbi->s_es->s_last_mounted, cp, 0); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); @@ -885,8 +884,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp) return ret; } - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | - FMODE_DIO_PARALLEL_WRITE; + filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; return dquot_file_open(inode, filp); } @@ -898,12 +896,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp) loff_t ext4_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; - loff_t maxbytes; - - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; - else - maxbytes = inode->i_sb->s_maxbytes; + loff_t maxbytes = ext4_get_maxbytes(inode); switch (whence) { default: @@ -938,7 +931,6 @@ const struct file_operations ext4_file_operations = { .compat_ioctl = ext4_compat_ioctl, #endif .mmap = ext4_file_mmap, - .mmap_supported_flags = MAP_SYNC, .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, @@ -946,6 +938,8 @@ const struct file_operations ext4_file_operations = { .splice_read = ext4_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, + .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | + FOP_DIO_PARALLEL_WRITE, }; const struct inode_operations ext4_file_inode_operations = { diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index 53a05b8292f0..91185c40f755 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -393,6 +393,14 @@ static unsigned int ext4_getfsmap_find_sb(struct super_block *sb, /* Reserved GDT blocks */ if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) { len = le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); + + /* + * mkfs.ext4 can set s_reserved_gdt_blocks as 0 in some cases, + * check for that. + */ + if (!len) + return 0; + error = ext4_getfsmap_fill(meta_list, fsb, len, EXT4_FMR_OWN_RESV_GDT); if (error) @@ -526,6 +534,7 @@ static int ext4_getfsmap_datadev(struct super_block *sb, ext4_group_t end_ag; ext4_grpblk_t first_cluster; ext4_grpblk_t last_cluster; + struct ext4_fsmap irec; int error = 0; bofs = le32_to_cpu(sbi->s_es->s_first_data_block); @@ -609,10 +618,18 @@ static int ext4_getfsmap_datadev(struct super_block *sb, goto err; } - /* Report any gaps at the end of the bg */ + /* + * The dummy record below will cause ext4_getfsmap_helper() to report + * any allocated blocks at the end of the range. + */ + irec.fmr_device = 0; + irec.fmr_physical = end_fsb + 1; + irec.fmr_length = 0; + irec.fmr_owner = EXT4_FMR_OWN_FREE; + irec.fmr_flags = 0; + info->gfi_last = true; - error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster + 1, - 0, info); + error = ext4_getfsmap_helper(sb, info, &irec); if (error) goto err; @@ -628,8 +645,9 @@ static bool ext4_getfsmap_is_valid_device(struct super_block *sb, if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev)) return true; - if (EXT4_SB(sb)->s_journal_bdev && - fm->fmr_device == new_encode_dev(EXT4_SB(sb)->s_journal_bdev->bd_dev)) + if (EXT4_SB(sb)->s_journal_bdev_file && + fm->fmr_device == + new_encode_dev(file_bdev(EXT4_SB(sb)->s_journal_bdev_file)->bd_dev)) return true; return false; } @@ -699,9 +717,9 @@ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head, memset(handlers, 0, sizeof(handlers)); handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev); handlers[0].gfd_fn = ext4_getfsmap_datadev; - if (EXT4_SB(sb)->s_journal_bdev) { + if (EXT4_SB(sb)->s_journal_bdev_file) { handlers[1].gfd_dev = new_encode_dev( - EXT4_SB(sb)->s_journal_bdev->bd_dev); + file_bdev(EXT4_SB(sb)->s_journal_bdev_file)->bd_dev); handlers[1].gfd_fn = ext4_getfsmap_logdev; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index d4d0ad689d3c..21d228073d79 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -87,10 +87,10 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) return 0; - grp = ext4_get_group_info(sb, block_group); - if (buffer_verified(bh)) return 0; + + grp = ext4_get_group_info(sb, block_group); if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) return -EFSCORRUPTED; @@ -98,8 +98,7 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, if (buffer_verified(bh)) goto verified; blk = ext4_inode_bitmap(sb, desc); - if (!ext4_inode_bitmap_csum_verify(sb, desc, bh, - EXT4_INODES_PER_GROUP(sb) / 8) || + if (!ext4_inode_bitmap_csum_verify(sb, desc, bh) || ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) { ext4_unlock_group(sb, block_group); ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " @@ -328,8 +327,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) if (percpu_counter_initialized(&sbi->s_dirs_counter)) percpu_counter_dec(&sbi->s_dirs_counter); } - ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh, - EXT4_INODES_PER_GROUP(sb) / 8); + ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh); ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); @@ -775,7 +773,7 @@ int ext4_mark_inode_used(struct super_block *sb, int ino) } gdp = ext4_get_group_desc(sb, group, &group_desc_bh); - if (!gdp || !group_desc_bh) { + if (!gdp) { err = -EINVAL; goto out; } @@ -854,8 +852,7 @@ int ext4_mark_inode_used(struct super_block *sb, int ino) ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); if (ext4_has_group_desc_csum(sb)) { - ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh, - EXT4_INODES_PER_GROUP(sb) / 8); + ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); } @@ -1065,7 +1062,6 @@ got_group: EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) goto next_group; -repeat_in_this_group: ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); if (!ret2) goto next_group; @@ -1115,8 +1111,6 @@ repeat_in_this_group: if (!ret2) goto got; /* we grabbed the inode! */ - if (ino < EXT4_INODES_PER_GROUP(sb)) - goto repeat_in_this_group; next_group: if (++group == ngroups) group = 0; @@ -1229,8 +1223,7 @@ got: } } if (ext4_has_group_desc_csum(sb)) { - ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh, - EXT4_INODES_PER_GROUP(sb) / 8); + ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); @@ -1255,8 +1248,8 @@ got: inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); - ei->i_crtime = inode->i_mtime; + simple_inode_init_ts(inode); + ei->i_crtime = inode_get_mtime(inode); memset(ei->i_data, 0, sizeof(ei->i_data)); ei->i_dir_start_lookup = 0; @@ -1341,10 +1334,7 @@ got: } } - if (ext4_handle_valid(handle)) { - ei->i_sync_tid = handle->h_transaction->t_tid; - ei->i_datasync_tid = handle->h_transaction->t_tid; - } + ext4_update_inode_fsync_trans(handle, inode, 1); err = ext4_mark_inode_dirty(handle, inode); if (err) { diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index f2c495b745f1..d45124318200 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -539,7 +539,7 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, int indirect_blks; int blocks_to_boundary = 0; int depth; - int count = 0; + u64 count = 0; ext4_fsblk_t first_block = 0; trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); @@ -588,7 +588,7 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, count++; /* Fill in size of a hole we found */ map->m_pblk = 0; - map->m_len = min_t(unsigned int, map->m_len, count); + map->m_len = umin(map->m_len, count); goto cleanup; } @@ -652,13 +652,6 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, ext4_update_inode_fsync_trans(handle, inode, 1); count = ar.len; - /* - * Update reserved blocks/metadata blocks after successful block - * allocation which had been deferred till now. - */ - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) - ext4_da_update_reserve_space(inode, count, 1); - got_it: map->m_flags |= EXT4_MAP_MAPPED; map->m_pblk = le32_to_cpu(chain[depth-1].key); @@ -714,7 +707,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode, * i_rwsem. So we can safely drop the i_data_sem here. */ BUG_ON(EXT4_JOURNAL(inode) == NULL); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); *dropped = 1; return 0; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index cb65052ee3de..9fb5e0f172a7 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -298,7 +298,11 @@ static int ext4_create_inline_data(handle_t *handle, if (error) goto out; - BUG_ON(!is.s.not_found); + if (!is.s.not_found) { + EXT4_ERROR_INODE(inode, "unexpected inline data xattr"); + error = -EFSCORRUPTED; + goto out; + } error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (error) { @@ -349,7 +353,11 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode, if (error) goto out; - BUG_ON(is.s.not_found); + if (is.s.not_found) { + EXT4_ERROR_INODE(inode, "missing inline data xattr"); + error = -EFSCORRUPTED; + goto out; + } len -= EXT4_MIN_INLINE_DATA_SIZE; value = kzalloc(len, GFP_NOFS); @@ -392,7 +400,7 @@ out: } static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, - unsigned int len) + loff_t len) { int ret, size, no_expand; struct ext4_inode_info *ei = EXT4_I(inode); @@ -502,9 +510,8 @@ static int ext4_read_inline_folio(struct inode *inode, struct folio *folio) BUG_ON(len > PAGE_SIZE); kaddr = kmap_local_folio(folio, 0); ret = ext4_read_inline_data(inode, kaddr, len, &iloc); - flush_dcache_folio(folio); + kaddr = folio_zero_tail(folio, len, kaddr + len); kunmap_local(kaddr); - folio_zero_segment(folio, len, folio_size(folio)); folio_mark_uptodate(folio); brelse(iloc.bh); @@ -602,10 +609,12 @@ retry: goto out; if (ext4_should_dioread_nolock(inode)) { - ret = __block_write_begin(&folio->page, from, to, - ext4_get_block_unwritten); + ret = ext4_block_write_begin(handle, folio, from, to, + ext4_get_block_unwritten); } else - ret = __block_write_begin(&folio->page, from, to, ext4_get_block); + ret = ext4_block_write_begin(handle, folio, from, to, + ext4_get_block); + clear_buffer_new(folio_buffers(folio)); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, @@ -661,7 +670,7 @@ out_nofolio: int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, - struct page **pagep) + struct folio **foliop) { int ret; handle_t *handle; @@ -709,7 +718,7 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, goto out; } - *pagep = &folio->page; + *foliop = folio; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { ret = 0; @@ -857,8 +866,8 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, goto out; } - ret = __block_write_begin(&folio->page, 0, inline_size, - ext4_da_get_block_prep); + ret = ext4_block_write_begin(NULL, folio, 0, inline_size, + ext4_da_get_block_prep); if (ret) { up_read(&EXT4_I(inode)->xattr_sem); folio_unlock(folio); @@ -867,6 +876,7 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, return ret; } + clear_buffer_new(folio_buffers(folio)); folio_mark_dirty(folio); folio_mark_uptodate(folio); ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); @@ -892,7 +902,7 @@ out: int ext4_da_write_inline_data_begin(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, - struct page **pagep, + struct folio **foliop, void **fsdata) { int ret; @@ -955,7 +965,7 @@ retry_journal: goto out_release_page; up_read(&EXT4_I(inode)->xattr_sem); - *pagep = &folio->page; + *foliop = folio; brelse(iloc.bh); return 1; out_release_page: @@ -1037,7 +1047,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, * happen is that the times are slightly out of date * and/or different from the directory change time. */ - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); ext4_update_dx_flag(dir); inode_inc_iversion(dir); return 1; @@ -1461,6 +1471,7 @@ int ext4_read_inline_dir(struct file *file, struct ext4_iloc iloc; void *dir_buf = NULL; int dotdot_offset, dotdot_size, extra_offset, extra_size; + struct dir_private_info *info = file->private_data; ret = ext4_get_inode_loc(inode, &iloc); if (ret) @@ -1504,12 +1515,12 @@ int ext4_read_inline_dir(struct file *file, extra_size = extra_offset + inline_size; /* - * If the version has changed since the last call to + * If the cookie has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the inline * dir to make sure. */ - if (!inode_eq_iversion(inode, file->f_version)) { + if (!inode_eq_iversion(inode, info->cookie)) { for (i = 0; i < extra_size && i < offset;) { /* * "." is with offset 0 and @@ -1541,7 +1552,7 @@ int ext4_read_inline_dir(struct file *file, } offset = i; ctx->pos = offset; - file->f_version = inode_query_iversion(inode); + info->cookie = inode_query_iversion(inode); } while (ctx->pos < extra_size) { @@ -1966,7 +1977,12 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline) if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0) goto out_error; - BUG_ON(is.s.not_found); + if (is.s.not_found) { + EXT4_ERROR_INODE(inode, + "missing inline data xattr"); + err = -EFSCORRUPTED; + goto out_error; + } value_len = le32_to_cpu(is.s.here->e_value_size); value = kmalloc(value_len, GFP_NOFS); @@ -2010,7 +2026,7 @@ out: ext4_orphan_del(handle, inode); if (err == 0) { - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err = ext4_mark_inode_dirty(handle, inode); if (IS_SYNC(inode)) ext4_handle_sync(handle); diff --git a/fs/ext4/inode-test.c b/fs/ext4/inode-test.c index f0c0fd507fbc..749af7ad4e09 100644 --- a/fs/ext4/inode-test.c +++ b/fs/ext4/inode-test.c @@ -279,4 +279,5 @@ static struct kunit_suite ext4_inode_test_suite = { kunit_test_suites(&ext4_inode_test_suite); +MODULE_DESCRIPTION("KUnit test of ext4 inode timestamp decoding"); MODULE_LICENSE("GPL v2"); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 18ec9106c5b0..7923602271ad 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -31,6 +31,7 @@ #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/mpage.h> +#include <linux/rmap.h> #include <linux/namei.h> #include <linux/uio.h> #include <linux/bio.h> @@ -49,6 +50,11 @@ #include <trace/events/ext4.h> +static void ext4_journalled_zero_new_buffers(handle_t *handle, + struct inode *inode, + struct folio *folio, + unsigned from, unsigned to); + static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { @@ -145,7 +151,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, */ int ext4_inode_is_fast_symlink(struct inode *inode) { - if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + if (!ext4_has_feature_ea_inode(inode->i_sb)) { int ea_blocks = EXT4_I(inode)->i_file_acl ? EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; @@ -371,17 +377,18 @@ void ext4_da_update_reserve_space(struct inode *inode, */ if ((ei->i_reserved_data_blocks == 0) && !inode_is_open_for_write(inode)) - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); } static int __check_block_validity(struct inode *inode, const char *func, unsigned int line, struct ext4_map_blocks *map) { - if (ext4_has_feature_journal(inode->i_sb) && - (inode->i_ino == - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + + if (journal && inode == journal->j_inode) return 0; + if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) { ext4_error_inode(inode, func, line, map->m_pblk, "lblock %lu mapped to illegal pblock %llu " @@ -478,7 +485,89 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status); + map->m_pblk, status, 0); + return retval; +} + +static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) +{ + struct extent_status es; + unsigned int status; + int err, retval = 0; + + /* + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE + * indicates that the blocks and quotas has already been + * checked when the data was copied into the page cache. + */ + if (map->m_flags & EXT4_MAP_DELAYED) + flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; + + /* + * Here we clear m_flags because after allocating an new extent, + * it will be set again. + */ + map->m_flags &= ~EXT4_MAP_FLAGS; + + /* + * We need to check for EXT4 here because migrate could have + * changed the inode type in between. + */ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags); + } else { + retval = ext4_ind_map_blocks(handle, inode, map, flags); + + /* + * We allocated new blocks which will result in i_data's + * format changing. Force the migrate to fail by clearing + * migrate flags. + */ + if (retval > 0 && map->m_flags & EXT4_MAP_NEW) + ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); + } + if (retval <= 0) + return retval; + + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode %lu: " + "retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); + } + + /* + * We have to zeroout blocks before inserting them into extent + * status tree. Otherwise someone could look them up there and + * use them before they are really zeroed. We also have to + * unmap metadata before zeroing as otherwise writeback can + * overwrite zeros with stale data from block device. + */ + if (flags & EXT4_GET_BLOCKS_ZERO && + map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { + err = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, + map->m_len); + if (err) + return err; + } + + /* + * If the extent has been zeroed out, we don't need to update + * extent status tree. + */ + if (flags & EXT4_GET_BLOCKS_PRE_IO && + ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + if (ext4_es_is_written(&es)) + return retval; + } + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, flags); + return retval; } @@ -494,9 +583,10 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping * based files * - * On success, it returns the number of blocks being mapped or allocated. if - * create==0 and the blocks are pre-allocated and unwritten, the resulting @map - * is marked as unwritten. If the create == 1, it will mark @map as mapped. + * On success, it returns the number of blocks being mapped or allocated. + * If flags doesn't contain EXT4_GET_BLOCKS_CREATE the blocks are + * pre-allocated and unwritten, the resulting @map is marked as unwritten. + * If the flags contain EXT4_GET_BLOCKS_CREATE, it will mark @map as mapped. * * It returns 0 if plain look up failed (blocks have not been allocated), in * that case, @map is returned as unmapped but we still do fill map->m_len to @@ -544,6 +634,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, map->m_len = retval; } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { map->m_pblk = 0; + map->m_flags |= ext4_es_is_delayed(&es) ? + EXT4_MAP_DELAYED : 0; retval = es.es_len - (map->m_lblk - es.es_lblk); if (retval > map->m_len) retval = map->m_len; @@ -573,32 +665,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - retval = ext4_ext_map_blocks(handle, inode, map, 0); - } else { - retval = ext4_ind_map_blocks(handle, inode, map, 0); - } - if (retval > 0) { - unsigned int status; - - if (unlikely(retval != map->m_len)) { - ext4_warning(inode->i_sb, - "ES len assertion failed for inode " - "%lu: retval %d != map->m_len %d", - inode->i_ino, retval, map->m_len); - WARN_ON(1); - } - - status = map->m_flags & EXT4_MAP_UNWRITTEN ? - EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && - !(status & EXTENT_STATUS_WRITTEN) && - ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, - map->m_lblk + map->m_len - 1)) - status |= EXTENT_STATUS_DELAYED; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status); - } + retval = ext4_map_query_blocks(handle, inode, map); up_read((&EXT4_I(inode)->i_data_sem)); found: @@ -616,8 +683,7 @@ found: * Returns if the blocks have already allocated * * Note that if blocks have been preallocated - * ext4_ext_get_block() returns the create = 0 - * with buffer head unmapped. + * ext4_ext_map_blocks() returns with buffer head unmapped */ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) /* @@ -629,88 +695,13 @@ found: return retval; /* - * Here we clear m_flags because after allocating an new extent, - * it will be set again. - */ - map->m_flags &= ~EXT4_MAP_FLAGS; - - /* * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take * the write lock of i_data_sem, and call get_block() * with create == 1 flag. */ down_write(&EXT4_I(inode)->i_data_sem); - - /* - * We need to check for EXT4 here because migrate - * could have changed the inode type in between - */ - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - retval = ext4_ext_map_blocks(handle, inode, map, flags); - } else { - retval = ext4_ind_map_blocks(handle, inode, map, flags); - - if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { - /* - * We allocated new blocks which will result in - * i_data's format changing. Force the migrate - * to fail by clearing migrate flags - */ - ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); - } - } - - if (retval > 0) { - unsigned int status; - - if (unlikely(retval != map->m_len)) { - ext4_warning(inode->i_sb, - "ES len assertion failed for inode " - "%lu: retval %d != map->m_len %d", - inode->i_ino, retval, map->m_len); - WARN_ON(1); - } - - /* - * We have to zeroout blocks before inserting them into extent - * status tree. Otherwise someone could look them up there and - * use them before they are really zeroed. We also have to - * unmap metadata before zeroing as otherwise writeback can - * overwrite zeros with stale data from block device. - */ - if (flags & EXT4_GET_BLOCKS_ZERO && - map->m_flags & EXT4_MAP_MAPPED && - map->m_flags & EXT4_MAP_NEW) { - ret = ext4_issue_zeroout(inode, map->m_lblk, - map->m_pblk, map->m_len); - if (ret) { - retval = ret; - goto out_sem; - } - } - - /* - * If the extent has been zeroed out, we don't need to update - * extent status tree. - */ - if ((flags & EXT4_GET_BLOCKS_PRE_IO) && - ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { - if (ext4_es_is_written(&es)) - goto out_sem; - } - status = map->m_flags & EXT4_MAP_UNWRITTEN ? - EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && - !(status & EXTENT_STATUS_WRITTEN) && - ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, - map->m_lblk + map->m_len - 1)) - status |= EXTENT_STATUS_DELAYED; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status); - } - -out_sem: + retval = ext4_map_create_blocks(handle, inode, map, flags); up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); @@ -1009,39 +1000,28 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, */ static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh) { - folio_mark_dirty(bh->b_folio); + struct folio *folio = bh->b_folio; + struct inode *inode = folio->mapping->host; + + /* only regular files have a_ops */ + if (S_ISREG(inode->i_mode)) + folio_mark_dirty(folio); return ext4_handle_dirty_metadata(handle, NULL, bh); } int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh) { - int dirty = buffer_dirty(bh); - int ret; - if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; - /* - * __block_write_begin() could have dirtied some buffers. Clean - * the dirty bit as jbd2_journal_get_write_access() could complain - * otherwise about fs integrity issues. Setting of the dirty bit - * by __block_write_begin() isn't a real problem here as we clear - * the bit before releasing a page lock and thus writeback cannot - * ever write the buffer. - */ - if (dirty) - clear_buffer_dirty(bh); BUFFER_TRACE(bh, "get write access"); - ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, + return ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); - if (!ret && dirty) - ret = ext4_dirty_journalled_data(handle, bh); - return ret; } -#ifdef CONFIG_FS_ENCRYPTION -static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, - get_block_t *get_block) +int ext4_block_write_begin(handle_t *handle, struct folio *folio, + loff_t pos, unsigned len, + get_block_t *get_block) { unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + len; @@ -1054,6 +1034,7 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, struct buffer_head *bh, *head, *wait[2]; int nr_wait = 0; int i; + bool should_journal_data = ext4_should_journal_data(inode); BUG_ON(!folio_test_locked(folio)); BUG_ON(from > PAGE_SIZE); @@ -1061,10 +1042,8 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, BUG_ON(from > to); head = folio_buffers(folio); - if (!head) { - create_empty_buffers(&folio->page, blocksize, 0); - head = folio_buffers(folio); - } + if (!head) + head = create_empty_buffers(folio, blocksize, 0); bbits = ilog2(blocksize); block = (sector_t)folio->index << (PAGE_SHIFT - bbits); @@ -1077,7 +1056,7 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, } continue; } - if (buffer_new(bh)) + if (WARN_ON_ONCE(buffer_new(bh))) clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); @@ -1085,10 +1064,22 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, if (err) break; if (buffer_new(bh)) { + /* + * We may be zeroing partial buffers or all new + * buffers in case of failure. Prepare JBD2 for + * that. + */ + if (should_journal_data) + do_journal_get_write_access(handle, + inode, bh); if (folio_test_uptodate(folio)) { - clear_buffer_new(bh); + /* + * Unlike __block_write_begin() we leave + * dirtying of new uptodate buffers to + * ->write_end() time or + * folio_zero_new_buffers(). + */ set_buffer_uptodate(bh); - mark_buffer_dirty(bh); continue; } if (block_end > to || block_start < from) @@ -1118,7 +1109,11 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, err = -EIO; } if (unlikely(err)) { - folio_zero_new_buffers(folio, from, to); + if (should_journal_data) + ext4_journalled_zero_new_buffers(handle, inode, folio, + from, to); + else + folio_zero_new_buffers(folio, from, to); } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { for (i = 0; i < nr_wait; i++) { int err2; @@ -1134,7 +1129,6 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, return err; } -#endif /* * To preserve ordering, it is essential that the hole instantiation and @@ -1145,7 +1139,7 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, */ static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; int ret, needed_blocks; @@ -1170,7 +1164,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, - pagep); + foliop); if (ret < 0) return ret; if (ret == 1) @@ -1194,7 +1188,7 @@ retry_grab: * starting the handle. */ if (!folio_buffers(folio)) - create_empty_buffers(&folio->page, inode->i_sb->s_blocksize, 0); + create_empty_buffers(folio, inode->i_sb->s_blocksize, 0); folio_unlock(folio); @@ -1216,19 +1210,12 @@ retry_journal: /* In case writeback began while the folio was unlocked */ folio_wait_stable(folio); -#ifdef CONFIG_FS_ENCRYPTION if (ext4_should_dioread_nolock(inode)) - ret = ext4_block_write_begin(folio, pos, len, + ret = ext4_block_write_begin(handle, folio, pos, len, ext4_get_block_unwritten); else - ret = ext4_block_write_begin(folio, pos, len, ext4_get_block); -#else - if (ext4_should_dioread_nolock(inode)) - ret = __block_write_begin(&folio->page, pos, len, - ext4_get_block_unwritten); - else - ret = __block_write_begin(&folio->page, pos, len, ext4_get_block); -#endif + ret = ext4_block_write_begin(handle, folio, pos, len, + ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, to, @@ -1241,7 +1228,7 @@ retry_journal: folio_unlock(folio); /* - * __block_write_begin may have instantiated a few blocks + * ext4_block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold i_rwsem. * @@ -1270,7 +1257,7 @@ retry_journal: folio_put(folio); return ret; } - *pagep = &folio->page; + *foliop = folio; return ret; } @@ -1285,6 +1272,7 @@ static int write_end_fn(handle_t *handle, struct inode *inode, ret = ext4_dirty_journalled_data(handle, bh); clear_buffer_meta(bh); clear_buffer_prio(bh); + clear_buffer_new(bh); return ret; } @@ -1292,15 +1280,14 @@ static int write_end_fn(handle_t *handle, struct inode *inode, * We need to pick up the new inode size which generic_commit_write gave us * `file' can be NULL - eg, when called from page_symlink(). * - * ext4 never places buffers on inode->i_mapping->private_list. metadata + * ext4 never places buffers on inode->i_mapping->i_private_list. metadata * buffers are managed internally. */ static int ext4_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; @@ -1315,7 +1302,7 @@ static int ext4_write_end(struct file *file, return ext4_write_inline_data_end(inode, pos, len, copied, folio); - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata); /* * it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. @@ -1328,8 +1315,10 @@ static int ext4_write_end(struct file *file, folio_unlock(folio); folio_put(folio); - if (old_size < pos && !verity) + if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); + ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); + } /* * Don't mark the inode dirty under folio lock. First, it unnecessarily * makes the holding time of folio lock longer. Second, it forces lock @@ -1389,9 +1378,9 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle, size = min(to, block_end) - start; folio_zero_range(folio, start, size); - write_end_fn(handle, inode, bh); } clear_buffer_new(bh); + write_end_fn(handle, inode, bh); } } block_start = block_end; @@ -1402,9 +1391,8 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle, static int ext4_journalled_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; @@ -1445,8 +1433,10 @@ static int ext4_journalled_write_end(struct file *file, folio_unlock(folio); folio_put(folio); - if (old_size < pos && !verity) + if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); + ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); + } if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); @@ -1479,9 +1469,9 @@ static int ext4_journalled_write_end(struct file *file, } /* - * Reserve space for a single cluster + * Reserve space for 'nr_resv' clusters */ -static int ext4_da_reserve_space(struct inode *inode) +static int ext4_da_reserve_space(struct inode *inode, int nr_resv) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); @@ -1492,18 +1482,18 @@ static int ext4_da_reserve_space(struct inode *inode) * us from metadata over-estimation, though we may go over by * a small amount in the end. Here we just reserve for data. */ - ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); + ret = dquot_reserve_block(inode, EXT4_C2B(sbi, nr_resv)); if (ret) return ret; spin_lock(&ei->i_block_reservation_lock); - if (ext4_claim_free_clusters(sbi, 1, 0)) { + if (ext4_claim_free_clusters(sbi, nr_resv, 0)) { spin_unlock(&ei->i_block_reservation_lock); - dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); + dquot_release_reservation_block(inode, EXT4_C2B(sbi, nr_resv)); return -ENOSPC; } - ei->i_reserved_data_blocks++; - trace_ext4_da_reserve_space(inode); + ei->i_reserved_data_blocks += nr_resv; + trace_ext4_da_reserve_space(inode, nr_resv); spin_unlock(&ei->i_block_reservation_lock); return 0; /* success */ @@ -1650,24 +1640,58 @@ static void ext4_print_free_blocks(struct inode *inode) } /* - * ext4_insert_delayed_block - adds a delayed block to the extents status - * tree, incrementing the reserved cluster/block - * count or making a pending reservation - * where needed + * Check whether the cluster containing lblk has been allocated or has + * delalloc reservation. + * + * Returns 0 if the cluster doesn't have either, 1 if it has delalloc + * reservation, 2 if it's already been allocated, negative error code on + * failure. + */ +static int ext4_clu_alloc_state(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int ret; + + /* Has delalloc reservation? */ + if (ext4_es_scan_clu(inode, &ext4_es_is_delayed, lblk)) + return 1; + + /* Already been allocated? */ + if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk)) + return 2; + ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk)); + if (ret < 0) + return ret; + if (ret > 0) + return 2; + + return 0; +} + +/* + * ext4_insert_delayed_blocks - adds a multiple delayed blocks to the extents + * status tree, incrementing the reserved + * cluster/block count or making pending + * reservations where needed * * @inode - file containing the newly added block - * @lblk - logical block to be added + * @lblk - start logical block to be added + * @len - length of blocks to be added * * Returns 0 on success, negative error code on failure. */ -static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) +static int ext4_insert_delayed_blocks(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; - bool allocated = false; + bool lclu_allocated = false; + bool end_allocated = false; + ext4_lblk_t resv_clu; + ext4_lblk_t end = lblk + len - 1; /* - * If the cluster containing lblk is shared with a delayed, + * If the cluster containing lblk or end is shared with a delayed, * written, or unwritten extent in a bigalloc file system, it's * already been accounted for and does not need to be reserved. * A pending reservation must be made for the cluster if it's @@ -1678,62 +1702,70 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) * extents status tree doesn't get a match. */ if (sbi->s_cluster_ratio == 1) { - ret = ext4_da_reserve_space(inode); + ret = ext4_da_reserve_space(inode, len); if (ret != 0) /* ENOSPC */ return ret; } else { /* bigalloc */ - if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) { - if (!ext4_es_scan_clu(inode, - &ext4_es_is_mapped, lblk)) { - ret = ext4_clu_mapped(inode, - EXT4_B2C(sbi, lblk)); - if (ret < 0) - return ret; - if (ret == 0) { - ret = ext4_da_reserve_space(inode); - if (ret != 0) /* ENOSPC */ - return ret; - } else { - allocated = true; - } - } else { - allocated = true; + resv_clu = EXT4_B2C(sbi, end) - EXT4_B2C(sbi, lblk) + 1; + + ret = ext4_clu_alloc_state(inode, lblk); + if (ret < 0) + return ret; + if (ret > 0) { + resv_clu--; + lclu_allocated = (ret == 2); + } + + if (EXT4_B2C(sbi, lblk) != EXT4_B2C(sbi, end)) { + ret = ext4_clu_alloc_state(inode, end); + if (ret < 0) + return ret; + if (ret > 0) { + resv_clu--; + end_allocated = (ret == 2); } } + + if (resv_clu) { + ret = ext4_da_reserve_space(inode, resv_clu); + if (ret != 0) /* ENOSPC */ + return ret; + } } - ext4_es_insert_delayed_block(inode, lblk, allocated); + ext4_es_insert_delayed_extent(inode, lblk, len, lclu_allocated, + end_allocated); return 0; } /* - * This function is grabs code from the very beginning of - * ext4_map_blocks, but assumes that the caller is from delayed write - * time. This function looks up the requested blocks and sets the - * buffer delay bit under the protection of i_data_sem. + * Looks up the requested blocks and sets the delalloc extent map. + * First try to look up for the extent entry that contains the requested + * blocks in the extent status tree without i_data_sem, then try to look + * up for the ondisk extent mapping with i_data_sem in read mode, + * finally hold i_data_sem in write mode, looks up again and add a + * delalloc extent entry if it still couldn't find any extent. Pass out + * the mapped extent through @map and return 0 on success. */ -static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, - struct ext4_map_blocks *map, - struct buffer_head *bh) +static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map) { struct extent_status es; int retval; - sector_t invalid_block = ~((sector_t) 0xffff); #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; memcpy(&orig_map, map, sizeof(*map)); #endif - if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) - invalid_block = ~0; - map->m_flags = 0; ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len, (unsigned long) map->m_lblk); /* Lookup extent status tree firstly */ - if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) { + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + map->m_len = min_t(unsigned int, map->m_len, + es.es_len - (map->m_lblk - es.es_lblk)); + if (ext4_es_is_hole(&es)) goto add_delayed; @@ -1742,18 +1774,12 @@ found: * Delayed extent could be allocated by fallocate. * So we need to check it. */ - if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) { - map_bh(bh, inode->i_sb, invalid_block); - set_buffer_new(bh); - set_buffer_delay(bh); + if (ext4_es_is_delayed(&es)) { + map->m_flags |= EXT4_MAP_DELAYED; return 0; } - map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk; - retval = es.es_len - (iblock - es.es_lblk); - if (retval > map->m_len) - retval = map->m_len; - map->m_len = retval; + map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; if (ext4_es_is_written(&es)) map->m_flags |= EXT4_MAP_MAPPED; else if (ext4_es_is_unwritten(&es)) @@ -1764,7 +1790,7 @@ found: #ifdef ES_AGGRESSIVE_TEST ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); #endif - return retval; + return 0; } /* @@ -1778,7 +1804,7 @@ found: retval = ext4_map_query_blocks(NULL, inode, map); up_read(&EXT4_I(inode)->i_data_sem); if (retval) - return retval; + return retval < 0 ? retval : 0; add_delayed: down_write(&EXT4_I(inode)->i_data_sem); @@ -1789,7 +1815,10 @@ add_delayed: * is held in write mode, before inserting a new da entry in * the extent status tree. */ - if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) { + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + map->m_len = min_t(unsigned int, map->m_len, + es.es_len - (map->m_lblk - es.es_lblk)); + if (!ext4_es_is_hole(&es)) { up_write(&EXT4_I(inode)->i_data_sem); goto found; @@ -1798,18 +1827,14 @@ add_delayed: retval = ext4_map_query_blocks(NULL, inode, map); if (retval) { up_write(&EXT4_I(inode)->i_data_sem); - return retval; + return retval < 0 ? retval : 0; } } - retval = ext4_insert_delayed_block(inode, map->m_lblk); + map->m_flags |= EXT4_MAP_DELAYED; + retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len); up_write(&EXT4_I(inode)->i_data_sem); - if (retval) - return retval; - map_bh(bh, inode->i_sb, invalid_block); - set_buffer_new(bh); - set_buffer_delay(bh); return retval; } @@ -1829,11 +1854,15 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { struct ext4_map_blocks map; + sector_t invalid_block = ~((sector_t) 0xffff); int ret = 0; BUG_ON(create == 0); BUG_ON(bh->b_size != inode->i_sb->s_blocksize); + if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) + invalid_block = ~0; + map.m_lblk = iblock; map.m_len = 1; @@ -1842,10 +1871,17 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ - ret = ext4_da_map_blocks(inode, iblock, &map, bh); - if (ret <= 0) + ret = ext4_da_map_blocks(inode, &map); + if (ret < 0) return ret; + if (map.m_flags & EXT4_MAP_DELAYED) { + map_bh(bh, inode->i_sb, invalid_block); + set_buffer_new(bh); + set_buffer_delay(bh); + return 0; + } + map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); @@ -1893,7 +1929,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) len = folio_size(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(mpd->inode)) - len = size & ~PAGE_MASK; + len = size & (len - 1); err = ext4_bio_write_folio(&mpd->io_submit, folio, len); if (!err) mpd->wbc->nr_to_write--; @@ -2173,11 +2209,6 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) * writeback and there is nothing we can do about it so it might result * in data loss. So use reserved blocks to allocate metadata if * possible. - * - * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if - * the blocks in question are delalloc blocks. This indicates - * that the blocks and quotas has already been checked when - * the data was copied into the page cache. */ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL | @@ -2185,8 +2216,6 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; - if (map->m_flags & BIT(BH_Delay)) - get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; err = ext4_map_blocks(handle, inode, map, get_blocks_flags); if (err < 0) @@ -2880,7 +2909,7 @@ static int ext4_nonda_switch(struct super_block *sb) static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret, retries = 0; struct folio *folio; @@ -2895,14 +2924,14 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(file, mapping, pos, - len, pagep, fsdata); + len, foliop, fsdata); } *fsdata = (void *)0; trace_ext4_da_write_begin(inode, pos, len); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len, - pagep, fsdata); + foliop, fsdata); if (ret < 0) return ret; if (ret == 1) @@ -2915,11 +2944,8 @@ retry: if (IS_ERR(folio)) return PTR_ERR(folio); -#ifdef CONFIG_FS_ENCRYPTION - ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep); -#else - ret = __block_write_begin(&folio->page, pos, len, ext4_da_get_block_prep); -#endif + ret = ext4_block_write_begin(NULL, folio, pos, len, + ext4_da_get_block_prep); if (ret < 0) { folio_unlock(folio); folio_put(folio); @@ -2937,7 +2963,7 @@ retry: return ret; } - *pagep = &folio->page; + *foliop = folio; return ret; } @@ -2971,7 +2997,8 @@ static int ext4_da_do_write_end(struct address_space *mapping, struct inode *inode = mapping->host; loff_t old_size = inode->i_size; bool disksize_changed = false; - loff_t new_i_size; + loff_t new_i_size, zero_len = 0; + handle_t *handle; if (unlikely(!folio_buffers(folio))) { folio_unlock(folio); @@ -2983,7 +3010,7 @@ static int ext4_da_do_write_end(struct address_space *mapping, * flag, which all that's needed to trigger page writeback. */ copied = block_write_end(NULL, mapping, pos, len, copied, - &folio->page, NULL); + folio, NULL); new_i_size = pos + copied; /* @@ -3015,18 +3042,21 @@ static int ext4_da_do_write_end(struct address_space *mapping, folio_unlock(folio); folio_put(folio); - if (old_size < pos) + if (pos > old_size) { pagecache_isize_extended(inode, old_size, pos); + zero_len = pos - old_size; + } - if (disksize_changed) { - handle_t *handle; + if (!disksize_changed && !zero_len) + return copied; - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) - return PTR_ERR(handle); - ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); - } + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + if (zero_len) + ext4_zero_partial_blocks(handle, inode, old_size, zero_len); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); return copied; } @@ -3034,15 +3064,14 @@ static int ext4_da_do_write_end(struct address_space *mapping, static int ext4_da_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int write_mode = (int)(unsigned long)fsdata; - struct folio *folio = page_folio(page); if (write_mode == FALL_BACK_TO_NONDELALLOC) return ext4_write_end(file, mapping, pos, - len, copied, &folio->page, fsdata); + len, copied, folio, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); @@ -3238,7 +3267,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) } /* Any metadata buffers to write? */ - if (!list_empty(&inode->i_mapping->private_list)) + if (!list_empty(&inode->i_mapping->i_private_list)) return true; return inode->i_state & I_DIRTY_DATASYNC; } @@ -3292,6 +3321,9 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, iomap->addr = (u64) map->m_pblk << blkbits; if (flags & IOMAP_DAX) iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; + } else if (map->m_flags & EXT4_MAP_DELAYED) { + iomap->type = IOMAP_DELALLOC; + iomap->addr = IOMAP_NULL_ADDR; } else { iomap->type = IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; @@ -3454,35 +3486,11 @@ const struct iomap_ops ext4_iomap_overwrite_ops = { .iomap_end = ext4_iomap_end, }; -static bool ext4_iomap_is_delalloc(struct inode *inode, - struct ext4_map_blocks *map) -{ - struct extent_status es; - ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1; - - ext4_es_find_extent_range(inode, &ext4_es_is_delayed, - map->m_lblk, end, &es); - - if (!es.es_len || es.es_lblk > end) - return false; - - if (es.es_lblk > map->m_lblk) { - map->m_len = es.es_lblk - map->m_lblk; - return false; - } - - offset = map->m_lblk - es.es_lblk; - map->m_len = es.es_len - offset; - - return true; -} - static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { int ret; - bool delalloc = false; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; @@ -3523,13 +3531,8 @@ static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) return ret; - if (ret == 0) - delalloc = ext4_iomap_is_delalloc(inode, &map); - set_iomap: ext4_set_iomap(inode, iomap, &map, offset, length, flags); - if (delalloc && iomap->type == IOMAP_HOLE) - iomap->type = IOMAP_DELALLOC; return 0; } @@ -3586,10 +3589,9 @@ static const struct address_space_operations ext4_aops = { .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, - .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; @@ -3603,10 +3605,9 @@ static const struct address_space_operations ext4_journalled_aops = { .bmap = ext4_bmap, .invalidate_folio = ext4_journalled_invalidate_folio, .release_folio = ext4_release_folio, - .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; @@ -3620,16 +3621,14 @@ static const struct address_space_operations ext4_da_aops = { .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, - .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_dax_aops = { .writepages = ext4_dax_writepages, - .direct_IO = noop_direct_IO, .dirty_folio = noop_dirty_folio, .bmap = ext4_bmap, .swap_activate = ext4_iomap_swap_activate, @@ -3655,6 +3654,12 @@ void ext4_set_aops(struct inode *inode) inode->i_mapping->a_ops = &ext4_aops; } +/* + * Here we can't skip an unwritten buffer even though it usually reads zero + * because it might have data in pagecache (eg, if called from ext4_zero_range, + * ext4_punch_hole, etc) which needs to be properly zeroed out. Otherwise a + * racing writeback can come later and flush the stale pagecache to disk. + */ static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { @@ -3678,10 +3683,8 @@ static int __ext4_block_zero_page_range(handle_t *handle, iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); bh = folio_buffers(folio); - if (!bh) { - create_empty_buffers(&folio->page, blocksize, 0); - bh = folio_buffers(folio); - } + if (!bh) + bh = create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ pos = blocksize; @@ -3883,6 +3886,68 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, return ret; } +static inline void ext4_truncate_folio(struct inode *inode, + loff_t start, loff_t end) +{ + unsigned long blocksize = i_blocksize(inode); + struct folio *folio; + + /* Nothing to be done if no complete block needs to be truncated. */ + if (round_up(start, blocksize) >= round_down(end, blocksize)) + return; + + folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT); + if (IS_ERR(folio)) + return; + + if (folio_mkclean(folio)) + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); +} + +int ext4_truncate_page_cache_block_range(struct inode *inode, + loff_t start, loff_t end) +{ + unsigned long blocksize = i_blocksize(inode); + int ret; + + /* + * For journalled data we need to write (and checkpoint) pages + * before discarding page cache to avoid inconsitent data on disk + * in case of crash before freeing or unwritten converting trans + * is committed. + */ + if (ext4_should_journal_data(inode)) { + ret = filemap_write_and_wait_range(inode->i_mapping, start, + end - 1); + if (ret) + return ret; + goto truncate_pagecache; + } + + /* + * If the block size is less than the page size, the file's mapped + * blocks within one page could be freed or converted to unwritten. + * So it's necessary to remove writable userspace mappings, and then + * ext4_page_mkwrite() can be called during subsequent write access + * to these partial folios. + */ + if (!IS_ALIGNED(start | end, PAGE_SIZE) && + blocksize < PAGE_SIZE && start < inode->i_size) { + loff_t page_boundary = round_up(start, PAGE_SIZE); + + ext4_truncate_folio(inode, start, min(page_boundary, end)); + if (end > page_boundary) + ext4_truncate_folio(inode, + round_down(end, PAGE_SIZE), end); + } + +truncate_pagecache: + truncate_pagecache_range(inode, start, end - 1); + return 0; +} + static void ext4_wait_dax_page(struct inode *inode) { filemap_invalidate_unlock(inode->i_mapping); @@ -3927,91 +3992,56 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; - ext4_lblk_t first_block, stop_block; - struct address_space *mapping = inode->i_mapping; - loff_t first_block_offset, last_block_offset, max_length; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t start_lblk, end_lblk; + loff_t max_end = sb->s_maxbytes; + loff_t end = offset + length; handle_t *handle; unsigned int credits; - int ret = 0, ret2 = 0; + int ret; trace_ext4_punch_hole(inode, offset, length, 0); + WARN_ON_ONCE(!inode_is_locked(inode)); /* - * Write out all dirty pages to avoid race conditions - * Then release them. + * For indirect-block based inodes, make sure that the hole within + * one block before last range. */ - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - ret = filemap_write_and_wait_range(mapping, offset, - offset + length - 1); - if (ret) - return ret; - } - - inode_lock(inode); + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize; /* No need to punch hole beyond i_size */ - if (offset >= inode->i_size) - goto out_mutex; + if (offset >= inode->i_size || offset >= max_end) + return 0; /* - * If the hole extends beyond i_size, set the hole - * to end after the page that contains i_size + * If the hole extends beyond i_size, set the hole to end after + * the page that contains i_size. */ - if (offset + length > inode->i_size) { - length = inode->i_size + - PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) - - offset; - } + if (end > inode->i_size) + end = round_up(inode->i_size, PAGE_SIZE); + if (end > max_end) + end = max_end; + length = end - offset; /* - * For punch hole the length + offset needs to be within one block - * before last range. Adjust the length if it goes beyond that limit. + * Attach jinode to inode for jbd2 if we do any zeroing of partial + * block. */ - max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize; - if (offset + length > max_length) - length = max_length - offset; - - if (offset & (sb->s_blocksize - 1) || - (offset + length) & (sb->s_blocksize - 1)) { - /* - * Attach jinode to inode for jbd2 if we do any zeroing of - * partial block - */ + if (!IS_ALIGNED(offset | end, sb->s_blocksize)) { ret = ext4_inode_attach_jinode(inode); if (ret < 0) - goto out_mutex; - + return ret; } - /* Wait all existing dio workers, newcomers will block on i_rwsem */ - inode_dio_wait(inode); - - ret = file_modified(file); - if (ret) - goto out_mutex; - - /* - * Prevent page faults from reinstantiating pages we have released from - * page cache. - */ - filemap_invalidate_lock(mapping); - ret = ext4_break_layouts(inode); + ret = ext4_update_disksize_before_punch(inode, offset, length); if (ret) - goto out_dio; - - first_block_offset = round_up(offset, sb->s_blocksize); - last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; + return ret; /* Now release the pages and zero block aligned part of pages*/ - if (last_block_offset > first_block_offset) { - ret = ext4_update_disksize_before_punch(inode, offset, length); - if (ret) - goto out_dio; - truncate_pagecache_range(inode, first_block_offset, - last_block_offset); - } + ret = ext4_truncate_page_cache_block_range(inode, offset, end); + if (ret) + return ret; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); @@ -4021,52 +4051,51 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(sb, ret); - goto out_dio; + return ret; } - ret = ext4_zero_partial_blocks(handle, inode, offset, - length); + ret = ext4_zero_partial_blocks(handle, inode, offset, length); if (ret) - goto out_stop; - - first_block = (offset + sb->s_blocksize - 1) >> - EXT4_BLOCK_SIZE_BITS(sb); - stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); + goto out_handle; /* If there are blocks to remove, do it */ - if (stop_block > first_block) { + start_lblk = EXT4_B_TO_LBLK(inode, offset); + end_lblk = end >> inode->i_blkbits; + + if (end_lblk > start_lblk) { + ext4_lblk_t hole_len = end_lblk - start_lblk; down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); - ext4_es_remove_extent(inode, first_block, - stop_block - first_block); + ext4_es_remove_extent(inode, start_lblk, hole_len); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ret = ext4_ext_remove_space(inode, first_block, - stop_block - 1); + ret = ext4_ext_remove_space(inode, start_lblk, + end_lblk - 1); else - ret = ext4_ind_remove_space(handle, inode, first_block, - stop_block); + ret = ext4_ind_remove_space(handle, inode, start_lblk, + end_lblk); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_handle; + } + ext4_es_insert_extent(inode, start_lblk, hole_len, ~0, + EXTENT_STATUS_HOLE, 0); up_write(&EXT4_I(inode)->i_data_sem); } - ext4_fc_track_range(handle, inode, first_block, stop_block); + ext4_fc_track_range(handle, inode, start_lblk, end_lblk); + + ret = ext4_mark_inode_dirty(handle, inode); + if (unlikely(ret)) + goto out_handle; + + ext4_update_inode_fsync_trans(handle, inode, 1); if (IS_SYNC(inode)) ext4_handle_sync(handle); - - inode->i_mtime = inode_set_ctime_current(inode); - ret2 = ext4_mark_inode_dirty(handle, inode); - if (unlikely(ret2)) - ret = ret2; - if (ret >= 0) - ext4_update_inode_fsync_trans(handle, inode, 1); -out_stop: +out_handle: ext4_journal_stop(handle); -out_dio: - filemap_invalidate_unlock(mapping); -out_mutex: - inode_unlock(inode); return ret; } @@ -4190,7 +4219,7 @@ int ext4_truncate(struct inode *inode) down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) err = ext4_ext_truncate(handle, inode); @@ -4215,7 +4244,7 @@ out_stop: if (inode->i_nlink) ext4_orphan_del(handle, inode); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(err2 && !err)) err = err2; @@ -4319,8 +4348,8 @@ static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); EXT4_INODE_SET_CTIME(inode, raw_inode); - EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); - EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); + EXT4_INODE_SET_MTIME(inode, raw_inode); + EXT4_INODE_SET_ATIME(inode, raw_inode); EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); @@ -4684,22 +4713,43 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) inode_set_iversion_queried(inode, val); } -static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags) - +static int check_igot_inode(struct inode *inode, ext4_iget_flags flags, + const char *function, unsigned int line) { + const char *err_str; + if (flags & EXT4_IGET_EA_INODE) { - if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) - return "missing EA_INODE flag"; + if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + err_str = "missing EA_INODE flag"; + goto error; + } if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) || - EXT4_I(inode)->i_file_acl) - return "ea_inode with extended attributes"; + EXT4_I(inode)->i_file_acl) { + err_str = "ea_inode with extended attributes"; + goto error; + } } else { - if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) - return "unexpected EA_INODE flag"; + if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + /* + * open_by_handle_at() could provide an old inode number + * that has since been reused for an ea_inode; this does + * not indicate filesystem corruption + */ + if (flags & EXT4_IGET_HANDLE) + return -ESTALE; + err_str = "unexpected EA_INODE flag"; + goto error; + } + } + if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) { + err_str = "unexpected bad inode w/o EXT4_IGET_BAD"; + goto error; } - if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) - return "unexpected bad inode w/o EXT4_IGET_BAD"; - return NULL; + return 0; + +error: + ext4_error_inode(inode, function, line, 0, err_str); + return -EFSCORRUPTED; } struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, @@ -4711,7 +4761,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, struct ext4_inode_info *ei; struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct inode *inode; - const char *err_str; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; loff_t size; @@ -4740,10 +4789,10 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) { - if ((err_str = check_igot_inode(inode, flags)) != NULL) { - ext4_error_inode(inode, function, line, 0, err_str); + ret = check_igot_inode(inode, flags, function, line); + if (ret) { iput(inode); - return ERR_PTR(-EFSCORRUPTED); + return ERR_PTR(ret); } return inode; } @@ -4855,7 +4904,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(sb, raw_inode); - if ((size = i_size_read(inode)) < 0) { + size = i_size_read(inode); + if (size < 0 || size > ext4_get_maxbytes(inode)) { ext4_error_inode(inode, function, line, 0, "iget: bad i_size value: %lld", size); ret = -EFSCORRUPTED; @@ -4928,8 +4978,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, } EXT4_INODE_GET_CTIME(inode, raw_inode); - EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); - EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); + EXT4_INODE_GET_ATIME(inode, raw_inode); + EXT4_INODE_GET_MTIME(inode, raw_inode); EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { @@ -5015,13 +5065,21 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ret = -EFSCORRUPTED; goto bad_inode; } - if ((err_str = check_igot_inode(inode, flags)) != NULL) { - ext4_error_inode(inode, function, line, 0, err_str); - ret = -EFSCORRUPTED; - goto bad_inode; + ret = check_igot_inode(inode, flags, function, line); + /* + * -ESTALE here means there is nothing inherently wrong with the inode, + * it's just not an inode we can return for an fhandle lookup. + */ + if (ret == -ESTALE) { + brelse(iloc.bh); + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(-ESTALE); } - + if (ret) + goto bad_inode; brelse(iloc.bh); + unlock_new_inode(inode); return inode; @@ -5054,8 +5112,8 @@ static void __ext4_update_other_inode_time(struct super_block *sb, spin_lock(&ei->i_raw_lock); EXT4_INODE_SET_CTIME(inode, raw_inode); - EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); - EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); + EXT4_INODE_SET_MTIME(inode, raw_inode); + EXT4_INODE_SET_ATIME(inode, raw_inode); ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); trace_ext4_other_inode_update_time(inode, orig_ino); @@ -5437,6 +5495,14 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } if (attr->ia_size != inode->i_size) { + /* attach jbd2 jinode for EOF folio tail zeroing */ + if (attr->ia_size & (inode->i_sb->s_blocksize - 1) || + oldsize & (inode->i_sb->s_blocksize - 1)) { + error = ext4_inode_attach_jinode(inode); + if (error) + goto out_mmap_sem; + } + handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); @@ -5447,11 +5513,17 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, orphan = 1; } /* - * Update c/mtime on truncate up, ext4_truncate() will - * update c/mtime in shrink case below + * Update c/mtime and tail zero the EOF folio on + * truncate up. ext4_truncate() handles the shrink case + * below. */ - if (!shrink) - inode->i_mtime = inode_set_ctime_current(inode); + if (!shrink) { + inode_set_mtime_to_ts(inode, + inode_set_ctime_current(inode)); + if (oldsize & (inode->i_sb->s_blocksize - 1)) + ext4_block_truncate_page(handle, + inode->i_mapping, oldsize); + } if (shrink) ext4_fc_track_range(handle, inode, @@ -6199,7 +6271,8 @@ retry_alloc: if (folio_pos(folio) + len > size) len = size - folio_pos(folio); - err = __block_write_begin(&folio->page, 0, len, ext4_get_block); + err = ext4_block_write_begin(handle, folio, 0, len, + ext4_get_block); if (!err) { ret = VM_FAULT_SIGBUS; if (ext4_journal_folio_buffers(handle, folio, len)) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 0bfe2ce589e2..1c77400bd88e 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -312,13 +312,22 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) struct ext4_inode_info *ei1; struct ext4_inode_info *ei2; unsigned long tmp; + struct timespec64 ts1, ts2; ei1 = EXT4_I(inode1); ei2 = EXT4_I(inode2); swap(inode1->i_version, inode2->i_version); - swap(inode1->i_atime, inode2->i_atime); - swap(inode1->i_mtime, inode2->i_mtime); + + ts1 = inode_get_atime(inode1); + ts2 = inode_get_atime(inode2); + inode_set_atime_to_ts(inode1, ts2); + inode_set_atime_to_ts(inode2, ts1); + + ts1 = inode_get_mtime(inode1); + ts2 = inode_get_mtime(inode2); + inode_set_mtime_to_ts(inode1, ts2); + inode_set_mtime_to_ts(inode2, ts1); memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP; @@ -458,7 +467,7 @@ static long swap_inode_boot_loader(struct super_block *sb, ext4_reset_inode_seed(inode); ext4_reset_inode_seed(inode_bl); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); err = ext4_mark_inode_dirty(handle, inode); if (err < 0) { @@ -810,11 +819,11 @@ int ext4_force_shutdown(struct super_block *sb, u32 flags) switch (flags) { case EXT4_GOING_FLAGS_DEFAULT: - ret = freeze_bdev(sb->s_bdev); + ret = bdev_freeze(sb->s_bdev); if (ret) return ret; set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); - thaw_bdev(sb->s_bdev); + bdev_thaw(sb->s_bdev); break; case EXT4_GOING_FLAGS_LOGFLUSH: set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); @@ -1141,9 +1150,8 @@ static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label */ BUILD_BUG_ON(EXT4_LABEL_MAX >= FSLABEL_MAX); - memset(label, 0, sizeof(label)); lock_buffer(sbi->s_sbh); - strncpy(label, sbi->s_es->s_volume_name, EXT4_LABEL_MAX); + memtostr_pad(label, sbi->s_es->s_volume_name); unlock_buffer(sbi->s_sbh); if (copy_to_user(user_label, label, sizeof(label))) @@ -1335,10 +1343,10 @@ group_extend_out: me.moved_len = 0; donor = fdget(me.donor_fd); - if (!donor.file) + if (!fd_file(donor)) return -EBADF; - if (!(donor.file->f_mode & FMODE_WRITE)) { + if (!(fd_file(donor)->f_mode & FMODE_WRITE)) { err = -EBADF; goto mext_out; } @@ -1359,7 +1367,7 @@ group_extend_out: if (err) goto mext_out; - err = ext4_move_extents(filp, donor.file, me.orig_start, + err = ext4_move_extents(filp, fd_file(donor), me.orig_start, me.donor_start, me.len, &me.moved_len); mnt_drop_write_file(filp); diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c new file mode 100644 index 000000000000..f13db95284d9 --- /dev/null +++ b/fs/ext4/mballoc-test.c @@ -0,0 +1,1001 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit test of ext4 multiblocks allocation. + */ + +#include <kunit/test.h> +#include <kunit/static_stub.h> +#include <linux/random.h> + +#include "ext4.h" + +struct mbt_grp_ctx { + struct buffer_head bitmap_bh; + /* desc and gd_bh are just the place holders for now */ + struct ext4_group_desc desc; + struct buffer_head gd_bh; +}; + +struct mbt_ctx { + struct mbt_grp_ctx *grp_ctx; +}; + +struct mbt_ext4_super_block { + struct ext4_super_block es; + struct ext4_sb_info sbi; + struct mbt_ctx mbt_ctx; +}; + +#define MBT_SB(_sb) (container_of((_sb)->s_fs_info, struct mbt_ext4_super_block, sbi)) +#define MBT_CTX(_sb) (&MBT_SB(_sb)->mbt_ctx) +#define MBT_GRP_CTX(_sb, _group) (&MBT_CTX(_sb)->grp_ctx[_group]) + +static struct inode *mbt_alloc_inode(struct super_block *sb) +{ + struct ext4_inode_info *ei; + + ei = kmalloc(sizeof(struct ext4_inode_info), GFP_KERNEL); + if (!ei) + return NULL; + + INIT_LIST_HEAD(&ei->i_orphan); + init_rwsem(&ei->xattr_sem); + init_rwsem(&ei->i_data_sem); + inode_init_once(&ei->vfs_inode); + ext4_fc_init_inode(&ei->vfs_inode); + + return &ei->vfs_inode; +} + +static void mbt_free_inode(struct inode *inode) +{ + kfree(EXT4_I(inode)); +} + +static const struct super_operations mbt_sops = { + .alloc_inode = mbt_alloc_inode, + .free_inode = mbt_free_inode, +}; + +static void mbt_kill_sb(struct super_block *sb) +{ + generic_shutdown_super(sb); +} + +static struct file_system_type mbt_fs_type = { + .name = "mballoc test", + .kill_sb = mbt_kill_sb, +}; + +static int mbt_mb_init(struct super_block *sb) +{ + ext4_fsblk_t block; + int ret; + + /* needed by ext4_mb_init->bdev_nonrot(sb->s_bdev) */ + sb->s_bdev = kzalloc(sizeof(*sb->s_bdev), GFP_KERNEL); + if (sb->s_bdev == NULL) + return -ENOMEM; + + sb->s_bdev->bd_queue = kzalloc(sizeof(struct request_queue), GFP_KERNEL); + if (sb->s_bdev->bd_queue == NULL) { + kfree(sb->s_bdev); + return -ENOMEM; + } + + /* + * needed by ext4_mb_init->ext4_mb_init_backend-> sbi->s_buddy_cache = + * new_inode(sb); + */ + INIT_LIST_HEAD(&sb->s_inodes); + sb->s_op = &mbt_sops; + + ret = ext4_mb_init(sb); + if (ret != 0) + goto err_out; + + block = ext4_count_free_clusters(sb); + ret = percpu_counter_init(&EXT4_SB(sb)->s_freeclusters_counter, block, + GFP_KERNEL); + if (ret != 0) + goto err_mb_release; + + ret = percpu_counter_init(&EXT4_SB(sb)->s_dirtyclusters_counter, 0, + GFP_KERNEL); + if (ret != 0) + goto err_freeclusters; + + return 0; + +err_freeclusters: + percpu_counter_destroy(&EXT4_SB(sb)->s_freeclusters_counter); +err_mb_release: + ext4_mb_release(sb); +err_out: + kfree(sb->s_bdev->bd_queue); + kfree(sb->s_bdev); + return ret; +} + +static void mbt_mb_release(struct super_block *sb) +{ + percpu_counter_destroy(&EXT4_SB(sb)->s_dirtyclusters_counter); + percpu_counter_destroy(&EXT4_SB(sb)->s_freeclusters_counter); + ext4_mb_release(sb); + kfree(sb->s_bdev->bd_queue); + kfree(sb->s_bdev); +} + +static int mbt_set(struct super_block *sb, void *data) +{ + return 0; +} + +static struct super_block *mbt_ext4_alloc_super_block(void) +{ + struct mbt_ext4_super_block *fsb; + struct super_block *sb; + struct ext4_sb_info *sbi; + + fsb = kzalloc(sizeof(*fsb), GFP_KERNEL); + if (fsb == NULL) + return NULL; + + sb = sget(&mbt_fs_type, NULL, mbt_set, 0, NULL); + if (IS_ERR(sb)) + goto out; + + sbi = &fsb->sbi; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) + goto out_deactivate; + + bgl_lock_init(sbi->s_blockgroup_lock); + + sbi->s_es = &fsb->es; + sbi->s_sb = sb; + sb->s_fs_info = sbi; + + up_write(&sb->s_umount); + return sb; + +out_deactivate: + deactivate_locked_super(sb); +out: + kfree(fsb); + return NULL; +} + +static void mbt_ext4_free_super_block(struct super_block *sb) +{ + struct mbt_ext4_super_block *fsb = MBT_SB(sb); + struct ext4_sb_info *sbi = EXT4_SB(sb); + + kfree(sbi->s_blockgroup_lock); + deactivate_super(sb); + kfree(fsb); +} + +struct mbt_ext4_block_layout { + unsigned char blocksize_bits; + unsigned int cluster_bits; + uint32_t blocks_per_group; + ext4_group_t group_count; + uint16_t desc_size; +}; + +static void mbt_init_sb_layout(struct super_block *sb, + struct mbt_ext4_block_layout *layout) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + + sb->s_blocksize = 1UL << layout->blocksize_bits; + sb->s_blocksize_bits = layout->blocksize_bits; + + sbi->s_groups_count = layout->group_count; + sbi->s_blocks_per_group = layout->blocks_per_group; + sbi->s_cluster_bits = layout->cluster_bits; + sbi->s_cluster_ratio = 1U << layout->cluster_bits; + sbi->s_clusters_per_group = layout->blocks_per_group >> + layout->cluster_bits; + sbi->s_desc_size = layout->desc_size; + sbi->s_desc_per_block_bits = + sb->s_blocksize_bits - (fls(layout->desc_size) - 1); + sbi->s_desc_per_block = 1 << sbi->s_desc_per_block_bits; + + es->s_first_data_block = cpu_to_le32(0); + es->s_blocks_count_lo = cpu_to_le32(layout->blocks_per_group * + layout->group_count); +} + +static int mbt_grp_ctx_init(struct super_block *sb, + struct mbt_grp_ctx *grp_ctx) +{ + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + + grp_ctx->bitmap_bh.b_data = kzalloc(EXT4_BLOCK_SIZE(sb), GFP_KERNEL); + if (grp_ctx->bitmap_bh.b_data == NULL) + return -ENOMEM; + mb_set_bits(grp_ctx->bitmap_bh.b_data, max, sb->s_blocksize * 8 - max); + ext4_free_group_clusters_set(sb, &grp_ctx->desc, max); + + return 0; +} + +static void mbt_grp_ctx_release(struct mbt_grp_ctx *grp_ctx) +{ + kfree(grp_ctx->bitmap_bh.b_data); + grp_ctx->bitmap_bh.b_data = NULL; +} + +static void mbt_ctx_mark_used(struct super_block *sb, ext4_group_t group, + unsigned int start, unsigned int len) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group); + + mb_set_bits(grp_ctx->bitmap_bh.b_data, start, len); +} + +static void *mbt_ctx_bitmap(struct super_block *sb, ext4_group_t group) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group); + + return grp_ctx->bitmap_bh.b_data; +} + +/* called after mbt_init_sb_layout */ +static int mbt_ctx_init(struct super_block *sb) +{ + struct mbt_ctx *ctx = MBT_CTX(sb); + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + + ctx->grp_ctx = kcalloc(ngroups, sizeof(struct mbt_grp_ctx), + GFP_KERNEL); + if (ctx->grp_ctx == NULL) + return -ENOMEM; + + for (i = 0; i < ngroups; i++) + if (mbt_grp_ctx_init(sb, &ctx->grp_ctx[i])) + goto out; + + /* + * first data block(first cluster in first group) is used by + * metadata, mark it used to avoid to alloc data block at first + * block which will fail ext4_sb_block_valid check. + */ + mb_set_bits(ctx->grp_ctx[0].bitmap_bh.b_data, 0, 1); + ext4_free_group_clusters_set(sb, &ctx->grp_ctx[0].desc, + EXT4_CLUSTERS_PER_GROUP(sb) - 1); + + return 0; +out: + while (i-- > 0) + mbt_grp_ctx_release(&ctx->grp_ctx[i]); + kfree(ctx->grp_ctx); + return -ENOMEM; +} + +static void mbt_ctx_release(struct super_block *sb) +{ + struct mbt_ctx *ctx = MBT_CTX(sb); + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + + for (i = 0; i < ngroups; i++) + mbt_grp_ctx_release(&ctx->grp_ctx[i]); + kfree(ctx->grp_ctx); +} + +static struct buffer_head * +ext4_read_block_bitmap_nowait_stub(struct super_block *sb, ext4_group_t block_group, + bool ignore_locked) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, block_group); + + /* paired with brelse from caller of ext4_read_block_bitmap_nowait */ + get_bh(&grp_ctx->bitmap_bh); + return &grp_ctx->bitmap_bh; +} + +static int ext4_wait_block_bitmap_stub(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh) +{ + /* + * real ext4_wait_block_bitmap will set these flags and + * functions like ext4_mb_init_cache will verify the flags. + */ + set_buffer_uptodate(bh); + set_bitmap_uptodate(bh); + set_buffer_verified(bh); + return 0; +} + +static struct ext4_group_desc * +ext4_get_group_desc_stub(struct super_block *sb, ext4_group_t block_group, + struct buffer_head **bh) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, block_group); + + if (bh != NULL) + *bh = &grp_ctx->gd_bh; + + return &grp_ctx->desc; +} + +static int +ext4_mb_mark_context_stub(handle_t *handle, struct super_block *sb, bool state, + ext4_group_t group, ext4_grpblk_t blkoff, + ext4_grpblk_t len, int flags, + ext4_grpblk_t *ret_changed) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group); + struct buffer_head *bitmap_bh = &grp_ctx->bitmap_bh; + + if (state) + mb_set_bits(bitmap_bh->b_data, blkoff, len); + else + mb_clear_bits(bitmap_bh->b_data, blkoff, len); + + return 0; +} + +#define TEST_GOAL_GROUP 1 +static int mbt_kunit_init(struct kunit *test) +{ + struct mbt_ext4_block_layout *layout = + (struct mbt_ext4_block_layout *)(test->param_value); + struct super_block *sb; + int ret; + + sb = mbt_ext4_alloc_super_block(); + if (sb == NULL) + return -ENOMEM; + + mbt_init_sb_layout(sb, layout); + + ret = mbt_ctx_init(sb); + if (ret != 0) { + mbt_ext4_free_super_block(sb); + return ret; + } + + test->priv = sb; + kunit_activate_static_stub(test, + ext4_read_block_bitmap_nowait, + ext4_read_block_bitmap_nowait_stub); + kunit_activate_static_stub(test, + ext4_wait_block_bitmap, + ext4_wait_block_bitmap_stub); + kunit_activate_static_stub(test, + ext4_get_group_desc, + ext4_get_group_desc_stub); + kunit_activate_static_stub(test, + ext4_mb_mark_context, + ext4_mb_mark_context_stub); + + /* stub function will be called in mbt_mb_init->ext4_mb_init */ + if (mbt_mb_init(sb) != 0) { + mbt_ctx_release(sb); + mbt_ext4_free_super_block(sb); + return -ENOMEM; + } + + return 0; +} + +static void mbt_kunit_exit(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + + mbt_mb_release(sb); + mbt_ctx_release(sb); + mbt_ext4_free_super_block(sb); +} + +static void test_new_blocks_simple(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + struct inode *inode; + struct ext4_allocation_request ar; + ext4_group_t i, goal_group = TEST_GOAL_GROUP; + int err = 0; + ext4_fsblk_t found; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL); + if (!inode) + return; + + inode->i_sb = sb; + ar.inode = inode; + + /* get block at goal */ + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_EQ_MSG(test, ar.goal, found, + "failed to alloc block at goal, expected %llu found %llu", + ar.goal, found); + + /* get block after goal in goal group */ + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_EQ_MSG(test, ar.goal + EXT4_C2B(sbi, 1), found, + "failed to alloc block after goal in goal group, expected %llu found %llu", + ar.goal + 1, found); + + /* get block after goal group */ + mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb)); + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_EQ_MSG(test, + ext4_group_first_block_no(sb, goal_group + 1), found, + "failed to alloc block after goal group, expected %llu found %llu", + ext4_group_first_block_no(sb, goal_group + 1), found); + + /* get block before goal group */ + for (i = goal_group; i < ext4_get_groups_count(sb); i++) + mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb)); + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_EQ_MSG(test, + ext4_group_first_block_no(sb, 0) + EXT4_C2B(sbi, 1), found, + "failed to alloc block before goal group, expected %llu found %llu", + ext4_group_first_block_no(sb, 0 + EXT4_C2B(sbi, 1)), found); + + /* no block available, fail to allocate block */ + for (i = 0; i < ext4_get_groups_count(sb); i++) + mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb)); + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_NE_MSG(test, err, 0, + "unexpectedly get block when no block is available"); +} + +#define TEST_RANGE_COUNT 8 + +struct test_range { + ext4_grpblk_t start; + ext4_grpblk_t len; +}; + +static void +mbt_generate_test_ranges(struct super_block *sb, struct test_range *ranges, + int count) +{ + ext4_grpblk_t start, len, max; + int i; + + max = EXT4_CLUSTERS_PER_GROUP(sb) / count; + for (i = 0; i < count; i++) { + start = get_random_u32() % max; + len = get_random_u32() % max; + len = min(len, max - start); + + ranges[i].start = start + i * max; + ranges[i].len = len; + } +} + +static void +validate_free_blocks_simple(struct kunit *test, struct super_block *sb, + ext4_group_t goal_group, ext4_grpblk_t start, + ext4_grpblk_t len) +{ + void *bitmap; + ext4_grpblk_t bit, max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_group_t i; + + for (i = 0; i < ext4_get_groups_count(sb); i++) { + if (i == goal_group) + continue; + + bitmap = mbt_ctx_bitmap(sb, i); + bit = mb_find_next_zero_bit(bitmap, max, 0); + KUNIT_ASSERT_EQ_MSG(test, bit, max, + "free block on unexpected group %d", i); + } + + bitmap = mbt_ctx_bitmap(sb, goal_group); + bit = mb_find_next_zero_bit(bitmap, max, 0); + KUNIT_ASSERT_EQ(test, bit, start); + + bit = mb_find_next_bit(bitmap, max, bit + 1); + KUNIT_ASSERT_EQ(test, bit, start + len); +} + +static void +test_free_blocks_simple_range(struct kunit *test, ext4_group_t goal_group, + ext4_grpblk_t start, ext4_grpblk_t len) +{ + struct super_block *sb = (struct super_block *)test->priv; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode; + ext4_fsblk_t block; + + inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL); + if (!inode) + return; + inode->i_sb = sb; + + if (len == 0) + return; + + block = ext4_group_first_block_no(sb, goal_group) + + EXT4_C2B(sbi, start); + ext4_free_blocks_simple(inode, block, len); + validate_free_blocks_simple(test, sb, goal_group, start, len); + mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb)); +} + +static void test_free_blocks_simple(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_group_t i; + struct test_range ranges[TEST_RANGE_COUNT]; + + for (i = 0; i < ext4_get_groups_count(sb); i++) + mbt_ctx_mark_used(sb, i, 0, max); + + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + for (i = 0; i < TEST_RANGE_COUNT; i++) + test_free_blocks_simple_range(test, TEST_GOAL_GROUP, + ranges[i].start, ranges[i].len); +} + +static void +test_mark_diskspace_used_range(struct kunit *test, + struct ext4_allocation_context *ac, + ext4_grpblk_t start, + ext4_grpblk_t len) +{ + struct super_block *sb = (struct super_block *)test->priv; + int ret; + void *bitmap; + ext4_grpblk_t i, max; + + /* ext4_mb_mark_diskspace_used will BUG if len is 0 */ + if (len == 0) + return; + + ac->ac_b_ex.fe_group = TEST_GOAL_GROUP; + ac->ac_b_ex.fe_start = start; + ac->ac_b_ex.fe_len = len; + + bitmap = mbt_ctx_bitmap(sb, TEST_GOAL_GROUP); + memset(bitmap, 0, sb->s_blocksize); + ret = ext4_mb_mark_diskspace_used(ac, NULL, 0); + KUNIT_ASSERT_EQ(test, ret, 0); + + max = EXT4_CLUSTERS_PER_GROUP(sb); + i = mb_find_next_bit(bitmap, max, 0); + KUNIT_ASSERT_EQ(test, i, start); + i = mb_find_next_zero_bit(bitmap, max, i + 1); + KUNIT_ASSERT_EQ(test, i, start + len); + i = mb_find_next_bit(bitmap, max, i + 1); + KUNIT_ASSERT_EQ(test, max, i); +} + +static void test_mark_diskspace_used(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + struct inode *inode; + struct ext4_allocation_context ac; + struct test_range ranges[TEST_RANGE_COUNT]; + int i; + + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + + inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL); + if (!inode) + return; + inode->i_sb = sb; + + ac.ac_status = AC_STATUS_FOUND; + ac.ac_sb = sb; + ac.ac_inode = inode; + for (i = 0; i < TEST_RANGE_COUNT; i++) + test_mark_diskspace_used_range(test, &ac, ranges[i].start, + ranges[i].len); +} + +static void mbt_generate_buddy(struct super_block *sb, void *buddy, + void *bitmap, struct ext4_group_info *grp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + uint32_t order, off; + void *bb, *bb_h; + int max; + + memset(buddy, 0xff, sb->s_blocksize); + memset(grp, 0, offsetof(struct ext4_group_info, + bb_counters[MB_NUM_ORDERS(sb)])); + + bb = bitmap; + max = EXT4_CLUSTERS_PER_GROUP(sb); + bb_h = buddy + sbi->s_mb_offsets[1]; + + off = mb_find_next_zero_bit(bb, max, 0); + grp->bb_first_free = off; + while (off < max) { + grp->bb_counters[0]++; + grp->bb_free++; + + if (!(off & 1) && !mb_test_bit(off + 1, bb)) { + grp->bb_free++; + grp->bb_counters[0]--; + mb_clear_bit(off >> 1, bb_h); + grp->bb_counters[1]++; + grp->bb_largest_free_order = 1; + off++; + } + + off = mb_find_next_zero_bit(bb, max, off + 1); + } + + for (order = 1; order < MB_NUM_ORDERS(sb) - 1; order++) { + bb = buddy + sbi->s_mb_offsets[order]; + bb_h = buddy + sbi->s_mb_offsets[order + 1]; + max = max >> 1; + off = mb_find_next_zero_bit(bb, max, 0); + + while (off < max) { + if (!(off & 1) && !mb_test_bit(off + 1, bb)) { + mb_set_bits(bb, off, 2); + grp->bb_counters[order] -= 2; + mb_clear_bit(off >> 1, bb_h); + grp->bb_counters[order + 1]++; + grp->bb_largest_free_order = order + 1; + off++; + } + + off = mb_find_next_zero_bit(bb, max, off + 1); + } + } + + max = EXT4_CLUSTERS_PER_GROUP(sb); + off = mb_find_next_zero_bit(bitmap, max, 0); + while (off < max) { + grp->bb_fragments++; + + off = mb_find_next_bit(bitmap, max, off + 1); + if (off + 1 >= max) + break; + + off = mb_find_next_zero_bit(bitmap, max, off + 1); + } +} + +static void +mbt_validate_group_info(struct kunit *test, struct ext4_group_info *grp1, + struct ext4_group_info *grp2) +{ + struct super_block *sb = (struct super_block *)test->priv; + int i; + + KUNIT_ASSERT_EQ(test, grp1->bb_first_free, + grp2->bb_first_free); + KUNIT_ASSERT_EQ(test, grp1->bb_fragments, + grp2->bb_fragments); + KUNIT_ASSERT_EQ(test, grp1->bb_free, grp2->bb_free); + KUNIT_ASSERT_EQ(test, grp1->bb_largest_free_order, + grp2->bb_largest_free_order); + + for (i = 1; i < MB_NUM_ORDERS(sb); i++) { + KUNIT_ASSERT_EQ_MSG(test, grp1->bb_counters[i], + grp2->bb_counters[i], + "bb_counters[%d] diffs, expected %d, generated %d", + i, grp1->bb_counters[i], + grp2->bb_counters[i]); + } +} + +static void +do_test_generate_buddy(struct kunit *test, struct super_block *sb, void *bitmap, + void *mbt_buddy, struct ext4_group_info *mbt_grp, + void *ext4_buddy, struct ext4_group_info *ext4_grp) +{ + int i; + + mbt_generate_buddy(sb, mbt_buddy, bitmap, mbt_grp); + + for (i = 0; i < MB_NUM_ORDERS(sb); i++) + ext4_grp->bb_counters[i] = 0; + /* needed by validation in ext4_mb_generate_buddy */ + ext4_grp->bb_free = mbt_grp->bb_free; + memset(ext4_buddy, 0xff, sb->s_blocksize); + ext4_mb_generate_buddy(sb, ext4_buddy, bitmap, TEST_GOAL_GROUP, + ext4_grp); + + KUNIT_ASSERT_EQ(test, memcmp(mbt_buddy, ext4_buddy, sb->s_blocksize), + 0); + mbt_validate_group_info(test, mbt_grp, ext4_grp); +} + +static void test_mb_generate_buddy(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + void *bitmap, *expected_bb, *generate_bb; + struct ext4_group_info *expected_grp, *generate_grp; + struct test_range ranges[TEST_RANGE_COUNT]; + int i; + + bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap); + expected_bb = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, expected_bb); + generate_bb = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, generate_bb); + expected_grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, + bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, expected_grp); + generate_grp = ext4_get_group_info(sb, TEST_GOAL_GROUP); + KUNIT_ASSERT_NOT_NULL(test, generate_grp); + + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + for (i = 0; i < TEST_RANGE_COUNT; i++) { + mb_set_bits(bitmap, ranges[i].start, ranges[i].len); + do_test_generate_buddy(test, sb, bitmap, expected_bb, + expected_grp, generate_bb, generate_grp); + } +} + +static void +test_mb_mark_used_range(struct kunit *test, struct ext4_buddy *e4b, + ext4_grpblk_t start, ext4_grpblk_t len, void *bitmap, + void *buddy, struct ext4_group_info *grp) +{ + struct super_block *sb = (struct super_block *)test->priv; + struct ext4_free_extent ex; + int i; + + /* mb_mark_used only accepts non-zero len */ + if (len == 0) + return; + + ex.fe_start = start; + ex.fe_len = len; + ex.fe_group = TEST_GOAL_GROUP; + + ext4_lock_group(sb, TEST_GOAL_GROUP); + mb_mark_used(e4b, &ex); + ext4_unlock_group(sb, TEST_GOAL_GROUP); + + mb_set_bits(bitmap, start, len); + /* bypass bb_free validatoin in ext4_mb_generate_buddy */ + grp->bb_free -= len; + memset(buddy, 0xff, sb->s_blocksize); + for (i = 0; i < MB_NUM_ORDERS(sb); i++) + grp->bb_counters[i] = 0; + ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp); + + KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize), + 0); + mbt_validate_group_info(test, grp, e4b->bd_info); +} + +static void test_mb_mark_used(struct kunit *test) +{ + struct ext4_buddy e4b; + struct super_block *sb = (struct super_block *)test->priv; + void *bitmap, *buddy; + struct ext4_group_info *grp; + int ret; + struct test_range ranges[TEST_RANGE_COUNT]; + int i; + + /* buddy cache assumes that each page contains at least one block */ + if (sb->s_blocksize > PAGE_SIZE) + kunit_skip(test, "blocksize exceeds pagesize"); + + bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap); + buddy = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy); + grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, + bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); + + ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); + KUNIT_ASSERT_EQ(test, ret, 0); + + grp->bb_free = EXT4_CLUSTERS_PER_GROUP(sb); + grp->bb_largest_free_order = -1; + grp->bb_avg_fragment_size_order = -1; + INIT_LIST_HEAD(&grp->bb_largest_free_order_node); + INIT_LIST_HEAD(&grp->bb_avg_fragment_size_node); + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + for (i = 0; i < TEST_RANGE_COUNT; i++) + test_mb_mark_used_range(test, &e4b, ranges[i].start, + ranges[i].len, bitmap, buddy, grp); + + ext4_mb_unload_buddy(&e4b); +} + +static void +test_mb_free_blocks_range(struct kunit *test, struct ext4_buddy *e4b, + ext4_grpblk_t start, ext4_grpblk_t len, void *bitmap, + void *buddy, struct ext4_group_info *grp) +{ + struct super_block *sb = (struct super_block *)test->priv; + int i; + + /* mb_free_blocks will WARN if len is 0 */ + if (len == 0) + return; + + ext4_lock_group(sb, e4b->bd_group); + mb_free_blocks(NULL, e4b, start, len); + ext4_unlock_group(sb, e4b->bd_group); + + mb_clear_bits(bitmap, start, len); + /* bypass bb_free validatoin in ext4_mb_generate_buddy */ + grp->bb_free += len; + memset(buddy, 0xff, sb->s_blocksize); + for (i = 0; i < MB_NUM_ORDERS(sb); i++) + grp->bb_counters[i] = 0; + ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp); + + KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize), + 0); + mbt_validate_group_info(test, grp, e4b->bd_info); + +} + +static void test_mb_free_blocks(struct kunit *test) +{ + struct ext4_buddy e4b; + struct super_block *sb = (struct super_block *)test->priv; + void *bitmap, *buddy; + struct ext4_group_info *grp; + struct ext4_free_extent ex; + int ret; + int i; + struct test_range ranges[TEST_RANGE_COUNT]; + + /* buddy cache assumes that each page contains at least one block */ + if (sb->s_blocksize > PAGE_SIZE) + kunit_skip(test, "blocksize exceeds pagesize"); + + bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap); + buddy = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy); + grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, + bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); + + ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); + KUNIT_ASSERT_EQ(test, ret, 0); + + ex.fe_start = 0; + ex.fe_len = EXT4_CLUSTERS_PER_GROUP(sb); + ex.fe_group = TEST_GOAL_GROUP; + + ext4_lock_group(sb, TEST_GOAL_GROUP); + mb_mark_used(&e4b, &ex); + ext4_unlock_group(sb, TEST_GOAL_GROUP); + + grp->bb_free = 0; + grp->bb_largest_free_order = -1; + grp->bb_avg_fragment_size_order = -1; + INIT_LIST_HEAD(&grp->bb_largest_free_order_node); + INIT_LIST_HEAD(&grp->bb_avg_fragment_size_node); + memset(bitmap, 0xff, sb->s_blocksize); + + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + for (i = 0; i < TEST_RANGE_COUNT; i++) + test_mb_free_blocks_range(test, &e4b, ranges[i].start, + ranges[i].len, bitmap, buddy, grp); + + ext4_mb_unload_buddy(&e4b); +} + +#define COUNT_FOR_ESTIMATE 100000 +static void test_mb_mark_used_cost(struct kunit *test) +{ + struct ext4_buddy e4b; + struct super_block *sb = (struct super_block *)test->priv; + struct ext4_free_extent ex; + int ret; + struct test_range ranges[TEST_RANGE_COUNT]; + int i, j; + unsigned long start, end, all = 0; + + /* buddy cache assumes that each page contains at least one block */ + if (sb->s_blocksize > PAGE_SIZE) + kunit_skip(test, "blocksize exceeds pagesize"); + + ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); + KUNIT_ASSERT_EQ(test, ret, 0); + + ex.fe_group = TEST_GOAL_GROUP; + for (j = 0; j < COUNT_FOR_ESTIMATE; j++) { + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + start = jiffies; + for (i = 0; i < TEST_RANGE_COUNT; i++) { + if (ranges[i].len == 0) + continue; + + ex.fe_start = ranges[i].start; + ex.fe_len = ranges[i].len; + ext4_lock_group(sb, TEST_GOAL_GROUP); + mb_mark_used(&e4b, &ex); + ext4_unlock_group(sb, TEST_GOAL_GROUP); + } + end = jiffies; + all += (end - start); + + for (i = 0; i < TEST_RANGE_COUNT; i++) { + if (ranges[i].len == 0) + continue; + + ext4_lock_group(sb, TEST_GOAL_GROUP); + mb_free_blocks(NULL, &e4b, ranges[i].start, + ranges[i].len); + ext4_unlock_group(sb, TEST_GOAL_GROUP); + } + } + + kunit_info(test, "costed jiffies %lu\n", all); + ext4_mb_unload_buddy(&e4b); +} + +static const struct mbt_ext4_block_layout mbt_test_layouts[] = { + { + .blocksize_bits = 10, + .cluster_bits = 3, + .blocks_per_group = 8192, + .group_count = 4, + .desc_size = 64, + }, + { + .blocksize_bits = 12, + .cluster_bits = 3, + .blocks_per_group = 8192, + .group_count = 4, + .desc_size = 64, + }, + { + .blocksize_bits = 16, + .cluster_bits = 3, + .blocks_per_group = 8192, + .group_count = 4, + .desc_size = 64, + }, +}; + +static void mbt_show_layout(const struct mbt_ext4_block_layout *layout, + char *desc) +{ + snprintf(desc, KUNIT_PARAM_DESC_SIZE, "block_bits=%d cluster_bits=%d " + "blocks_per_group=%d group_count=%d desc_size=%d\n", + layout->blocksize_bits, layout->cluster_bits, + layout->blocks_per_group, layout->group_count, + layout->desc_size); +} +KUNIT_ARRAY_PARAM(mbt_layouts, mbt_test_layouts, mbt_show_layout); + +static struct kunit_case mbt_test_cases[] = { + KUNIT_CASE_PARAM(test_new_blocks_simple, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_free_blocks_simple, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_mb_generate_buddy, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_mb_mark_used, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_mb_free_blocks, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_mark_diskspace_used, mbt_layouts_gen_params), + KUNIT_CASE_PARAM_ATTR(test_mb_mark_used_cost, mbt_layouts_gen_params, + { .speed = KUNIT_SPEED_SLOW }), + {} +}; + +static struct kunit_suite mbt_test_suite = { + .name = "ext4_mballoc_test", + .init = mbt_kunit_init, + .exit = mbt_kunit_exit, + .test_cases = mbt_test_cases, +}; + +kunit_test_suites(&mbt_test_suite); + +MODULE_LICENSE("GPL"); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8a9f8c95c6f1..76331cdb4cb5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -18,6 +18,7 @@ #include <linux/backing-dev.h> #include <linux/freezer.h> #include <trace/events/ext4.h> +#include <kunit/static_stub.h> /* * MUSTDO: @@ -563,14 +564,14 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(EXT4_SB(sb), first + i); + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, inode ? inode->i_ino : 0, blocknr, "freeing block already freed " "(bit %u)", first + i); - ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, - EXT4_GROUP_INFO_BBITMAP_CORRUPT); } mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); } @@ -676,7 +677,7 @@ do { \ } \ } while (0) -static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, +static void __mb_check_buddy(struct ext4_buddy *e4b, char *file, const char *function, int line) { struct super_block *sb = e4b->bd_sb; @@ -695,7 +696,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, void *buddy2; if (e4b->bd_info->bb_check_counter++ % 10) - return 0; + return; while (order > 1) { buddy = mb_find_buddy(e4b, order, &max); @@ -757,7 +758,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, grp = ext4_get_group_info(sb, e4b->bd_group); if (!grp) - return NULL; + return; list_for_each(cur, &grp->bb_prealloc_list) { ext4_group_t groupnr; struct ext4_prealloc_space *pa; @@ -767,7 +768,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, for (i = 0; i < pa->pa_len; i++) MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); } - return 0; } #undef MB_CHECK_ASSERT #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ @@ -841,30 +841,30 @@ static void mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); - int new_order; + int new, old; - if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_fragments == 0) + if (!test_opt2(sb, MB_OPTIMIZE_SCAN)) return; - new_order = mb_avg_fragment_size_order(sb, - grp->bb_free / grp->bb_fragments); - if (new_order == grp->bb_avg_fragment_size_order) + old = grp->bb_avg_fragment_size_order; + new = grp->bb_fragments == 0 ? -1 : + mb_avg_fragment_size_order(sb, grp->bb_free / grp->bb_fragments); + if (new == old) return; - if (grp->bb_avg_fragment_size_order != -1) { - write_lock(&sbi->s_mb_avg_fragment_size_locks[ - grp->bb_avg_fragment_size_order]); + if (old >= 0) { + write_lock(&sbi->s_mb_avg_fragment_size_locks[old]); list_del(&grp->bb_avg_fragment_size_node); - write_unlock(&sbi->s_mb_avg_fragment_size_locks[ - grp->bb_avg_fragment_size_order]); + write_unlock(&sbi->s_mb_avg_fragment_size_locks[old]); + } + + grp->bb_avg_fragment_size_order = new; + if (new >= 0) { + write_lock(&sbi->s_mb_avg_fragment_size_locks[new]); + list_add_tail(&grp->bb_avg_fragment_size_node, + &sbi->s_mb_avg_fragment_size[new]); + write_unlock(&sbi->s_mb_avg_fragment_size_locks[new]); } - grp->bb_avg_fragment_size_order = new_order; - write_lock(&sbi->s_mb_avg_fragment_size_locks[ - grp->bb_avg_fragment_size_order]); - list_add_tail(&grp->bb_avg_fragment_size_node, - &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]); - write_unlock(&sbi->s_mb_avg_fragment_size_locks[ - grp->bb_avg_fragment_size_order]); } /* @@ -872,7 +872,7 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) * cr level needs an update. */ static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) + enum criteria *new_cr, ext4_group_t *group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *iter; @@ -946,7 +946,7 @@ ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int o * order. Updates *new_cr if cr level needs an update. */ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) + enum criteria *new_cr, ext4_group_t *group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *grp = NULL; @@ -991,7 +991,7 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context * * much and fall to CR_GOAL_LEN_SLOW in that case. */ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) + enum criteria *new_cr, ext4_group_t *group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *grp = NULL; @@ -1080,23 +1080,11 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac) } /* - * Return next linear group for allocation. If linear traversal should not be - * performed, this function just returns the same group + * Return next linear group for allocation. */ static ext4_group_t -next_linear_group(struct ext4_allocation_context *ac, ext4_group_t group, - ext4_group_t ngroups) +next_linear_group(ext4_group_t group, ext4_group_t ngroups) { - if (!should_optimize_scan(ac)) - goto inc_and_return; - - if (ac->ac_groups_linear_remaining) { - ac->ac_groups_linear_remaining--; - goto inc_and_return; - } - - return group; -inc_and_return: /* * Artificially restricted ngroups for non-extent * files makes group > ngroups possible on first loop. @@ -1122,21 +1110,33 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, { *new_cr = ac->ac_criteria; - if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) { - *group = next_linear_group(ac, *group, ngroups); + if (!should_optimize_scan(ac)) { + *group = next_linear_group(*group, ngroups); + return; + } + + /* + * Optimized scanning can return non adjacent groups which can cause + * seek overhead for rotational disks. So try few linear groups before + * trying optimized scan. + */ + if (ac->ac_groups_linear_remaining) { + *group = next_linear_group(*group, ngroups); + ac->ac_groups_linear_remaining--; return; } if (*new_cr == CR_POWER2_ALIGNED) { - ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group, ngroups); + ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group); } else if (*new_cr == CR_GOAL_LEN_FAST) { - ext4_mb_choose_next_group_goal_fast(ac, new_cr, group, ngroups); + ext4_mb_choose_next_group_goal_fast(ac, new_cr, group); } else if (*new_cr == CR_BEST_AVAIL_LEN) { - ext4_mb_choose_next_group_best_avail(ac, new_cr, group, ngroups); + ext4_mb_choose_next_group_best_avail(ac, new_cr, group); } else { /* - * TODO: For CR=2, we can arrange groups in an rb tree sorted by - * bb_free. But until that happens, we should never come here. + * TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an + * rb tree sorted by bb_free. But until that happens, we should + * never come here. */ WARN_ON(1); } @@ -1150,33 +1150,28 @@ static void mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); - int i; + int new, old = grp->bb_largest_free_order; - for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) - if (grp->bb_counters[i] > 0) + for (new = MB_NUM_ORDERS(sb) - 1; new >= 0; new--) + if (grp->bb_counters[new] > 0) break; + /* No need to move between order lists? */ - if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || - i == grp->bb_largest_free_order) { - grp->bb_largest_free_order = i; + if (new == old) return; - } - if (grp->bb_largest_free_order >= 0) { - write_lock(&sbi->s_mb_largest_free_orders_locks[ - grp->bb_largest_free_order]); + if (old >= 0 && !list_empty(&grp->bb_largest_free_order_node)) { + write_lock(&sbi->s_mb_largest_free_orders_locks[old]); list_del_init(&grp->bb_largest_free_order_node); - write_unlock(&sbi->s_mb_largest_free_orders_locks[ - grp->bb_largest_free_order]); + write_unlock(&sbi->s_mb_largest_free_orders_locks[old]); } - grp->bb_largest_free_order = i; - if (grp->bb_largest_free_order >= 0 && grp->bb_free) { - write_lock(&sbi->s_mb_largest_free_orders_locks[ - grp->bb_largest_free_order]); + + grp->bb_largest_free_order = new; + if (test_opt2(sb, MB_OPTIMIZE_SCAN) && new >= 0 && grp->bb_free) { + write_lock(&sbi->s_mb_largest_free_orders_locks[new]); list_add_tail(&grp->bb_largest_free_order_node, - &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]); - write_unlock(&sbi->s_mb_largest_free_orders_locks[ - grp->bb_largest_free_order]); + &sbi->s_mb_largest_free_orders[new]); + write_unlock(&sbi->s_mb_largest_free_orders_locks[new]); } } @@ -1274,7 +1269,7 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b) * for this page; do not hold this lock when calling this routine! */ -static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) +static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) { ext4_group_t ngroups; unsigned int blocksize; @@ -1292,13 +1287,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) char *bitmap; struct ext4_group_info *grinfo; - inode = page->mapping->host; + inode = folio->mapping->host; sb = inode->i_sb; ngroups = ext4_get_groups_count(sb); blocksize = i_blocksize(inode); blocks_per_page = PAGE_SIZE / blocksize; - mb_debug(sb, "init page %lu\n", page->index); + mb_debug(sb, "init folio %lu\n", folio->index); groups_per_page = blocks_per_page >> 1; if (groups_per_page == 0) @@ -1313,9 +1308,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) } else bh = &bhs; - first_group = page->index * blocks_per_page / 2; + first_group = folio->index * blocks_per_page / 2; - /* read all groups the page covers into the cache */ + /* read all groups the folio covers into the cache */ for (i = 0, group = first_group; i < groups_per_page; i++, group++) { if (group >= ngroups) break; @@ -1326,10 +1321,11 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) /* * If page is uptodate then we came here after online resize * which added some new uninitialized group info structs, so - * we must skip all initialized uptodate buddies on the page, + * we must skip all initialized uptodate buddies on the folio, * which may be currently in use by an allocating task. */ - if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { + if (folio_test_uptodate(folio) && + !EXT4_MB_GRP_NEED_INIT(grinfo)) { bh[i] = NULL; continue; } @@ -1353,7 +1349,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) err = err2; } - first_block = page->index * blocks_per_page; + first_block = folio->index * blocks_per_page; for (i = 0; i < blocks_per_page; i++) { group = (first_block + i) >> 1; if (group >= ngroups) @@ -1374,7 +1370,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) * above * */ - data = page_address(page) + (i * blocksize); + data = folio_address(folio) + (i * blocksize); bitmap = bh[group - first_group]->b_data; /* @@ -1389,8 +1385,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) if ((first_block + i) & 1) { /* this is block of buddy */ BUG_ON(incore == NULL); - mb_debug(sb, "put buddy for group %u in page %lu/%x\n", - group, page->index, i * blocksize); + mb_debug(sb, "put buddy for group %u in folio %lu/%x\n", + group, folio->index, i * blocksize); trace_ext4_mb_buddy_bitmap_load(sb, group); grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, @@ -1408,8 +1404,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) } else { /* this is block of bitmap */ BUG_ON(incore != NULL); - mb_debug(sb, "put bitmap for group %u in page %lu/%x\n", - group, page->index, i * blocksize); + mb_debug(sb, "put bitmap for group %u in folio %lu/%x\n", + group, folio->index, i * blocksize); trace_ext4_mb_bitmap_load(sb, group); /* see comments in ext4_mb_put_pa() */ @@ -1427,7 +1423,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) incore = data; } } - SetPageUptodate(page); + folio_mark_uptodate(folio); out: if (bh) { @@ -1443,7 +1439,7 @@ out: * Lock the buddy and bitmap pages. This make sure other parallel init_group * on the same buddy page doesn't happen whild holding the buddy page lock. * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap - * are on the same page e4b->bd_buddy_page is NULL and return value is 0. + * are on the same page e4b->bd_buddy_folio is NULL and return value is 0. */ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) @@ -1451,10 +1447,10 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, struct inode *inode = EXT4_SB(sb)->s_buddy_cache; int block, pnum, poff; int blocks_per_page; - struct page *page; + struct folio *folio; - e4b->bd_buddy_page = NULL; - e4b->bd_bitmap_page = NULL; + e4b->bd_buddy_folio = NULL; + e4b->bd_bitmap_folio = NULL; blocks_per_page = PAGE_SIZE / sb->s_blocksize; /* @@ -1465,37 +1461,38 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, block = group * 2; pnum = block / blocks_per_page; poff = block % blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, gfp); - if (!page) - return -ENOMEM; - BUG_ON(page->mapping != inode->i_mapping); - e4b->bd_bitmap_page = page; - e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + folio = __filemap_get_folio(inode->i_mapping, pnum, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (IS_ERR(folio)) + return PTR_ERR(folio); + BUG_ON(folio->mapping != inode->i_mapping); + e4b->bd_bitmap_folio = folio; + e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); if (blocks_per_page >= 2) { /* buddy and bitmap are on the same page */ return 0; } - block++; - pnum = block / blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, gfp); - if (!page) - return -ENOMEM; - BUG_ON(page->mapping != inode->i_mapping); - e4b->bd_buddy_page = page; + /* blocks_per_page == 1, hence we need another page for the buddy */ + folio = __filemap_get_folio(inode->i_mapping, block + 1, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (IS_ERR(folio)) + return PTR_ERR(folio); + BUG_ON(folio->mapping != inode->i_mapping); + e4b->bd_buddy_folio = folio; return 0; } static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) { - if (e4b->bd_bitmap_page) { - unlock_page(e4b->bd_bitmap_page); - put_page(e4b->bd_bitmap_page); + if (e4b->bd_bitmap_folio) { + folio_unlock(e4b->bd_bitmap_folio); + folio_put(e4b->bd_bitmap_folio); } - if (e4b->bd_buddy_page) { - unlock_page(e4b->bd_buddy_page); - put_page(e4b->bd_buddy_page); + if (e4b->bd_buddy_folio) { + folio_unlock(e4b->bd_buddy_folio); + folio_put(e4b->bd_buddy_folio); } } @@ -1510,7 +1507,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) struct ext4_group_info *this_grp; struct ext4_buddy e4b; - struct page *page; + struct folio *folio; int ret = 0; might_sleep(); @@ -1537,16 +1534,16 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) goto err; } - page = e4b.bd_bitmap_page; - ret = ext4_mb_init_cache(page, NULL, gfp); + folio = e4b.bd_bitmap_folio; + ret = ext4_mb_init_cache(folio, NULL, gfp); if (ret) goto err; - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } - if (e4b.bd_buddy_page == NULL) { + if (e4b.bd_buddy_folio == NULL) { /* * If both the bitmap and buddy are in * the same page we don't need to force @@ -1556,11 +1553,11 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) goto err; } /* init buddy cache */ - page = e4b.bd_buddy_page; - ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp); + folio = e4b.bd_buddy_folio; + ret = ext4_mb_init_cache(folio, e4b.bd_bitmap, gfp); if (ret) goto err; - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } @@ -1582,7 +1579,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, int block; int pnum; int poff; - struct page *page; + struct folio *folio; int ret; struct ext4_group_info *grp; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1600,8 +1597,8 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, e4b->bd_info = grp; e4b->bd_sb = sb; e4b->bd_group = group; - e4b->bd_buddy_page = NULL; - e4b->bd_bitmap_page = NULL; + e4b->bd_buddy_folio = NULL; + e4b->bd_bitmap_folio = NULL; if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { /* @@ -1622,102 +1619,103 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, pnum = block / blocks_per_page; poff = block % blocks_per_page; - /* we could use find_or_create_page(), but it locks page - * what we'd like to avoid in fast path ... */ - page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); - if (page == NULL || !PageUptodate(page)) { - if (page) + /* Avoid locking the folio in the fast path ... */ + folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); + if (IS_ERR(folio) || !folio_test_uptodate(folio)) { + if (!IS_ERR(folio)) /* - * drop the page reference and try - * to get the page with lock. If we + * drop the folio reference and try + * to get the folio with lock. If we * are not uptodate that implies - * somebody just created the page but - * is yet to initialize the same. So + * somebody just created the folio but + * is yet to initialize it. So * wait for it to initialize. */ - put_page(page); - page = find_or_create_page(inode->i_mapping, pnum, gfp); - if (page) { - if (WARN_RATELIMIT(page->mapping != inode->i_mapping, - "ext4: bitmap's paging->mapping != inode->i_mapping\n")) { + folio_put(folio); + folio = __filemap_get_folio(inode->i_mapping, pnum, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (!IS_ERR(folio)) { + if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, + "ext4: bitmap's mapping != inode->i_mapping\n")) { /* should never happen */ - unlock_page(page); + folio_unlock(folio); ret = -EINVAL; goto err; } - if (!PageUptodate(page)) { - ret = ext4_mb_init_cache(page, NULL, gfp); + if (!folio_test_uptodate(folio)) { + ret = ext4_mb_init_cache(folio, NULL, gfp); if (ret) { - unlock_page(page); + folio_unlock(folio); goto err; } - mb_cmp_bitmaps(e4b, page_address(page) + + mb_cmp_bitmaps(e4b, folio_address(folio) + (poff * sb->s_blocksize)); } - unlock_page(page); + folio_unlock(folio); } } - if (page == NULL) { - ret = -ENOMEM; + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto err; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } - /* Pages marked accessed already */ - e4b->bd_bitmap_page = page; - e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + /* Folios marked accessed already */ + e4b->bd_bitmap_folio = folio; + e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); block++; pnum = block / blocks_per_page; poff = block % blocks_per_page; - page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); - if (page == NULL || !PageUptodate(page)) { - if (page) - put_page(page); - page = find_or_create_page(inode->i_mapping, pnum, gfp); - if (page) { - if (WARN_RATELIMIT(page->mapping != inode->i_mapping, - "ext4: buddy bitmap's page->mapping != inode->i_mapping\n")) { + folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); + if (IS_ERR(folio) || !folio_test_uptodate(folio)) { + if (!IS_ERR(folio)) + folio_put(folio); + folio = __filemap_get_folio(inode->i_mapping, pnum, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (!IS_ERR(folio)) { + if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, + "ext4: buddy bitmap's mapping != inode->i_mapping\n")) { /* should never happen */ - unlock_page(page); + folio_unlock(folio); ret = -EINVAL; goto err; } - if (!PageUptodate(page)) { - ret = ext4_mb_init_cache(page, e4b->bd_bitmap, + if (!folio_test_uptodate(folio)) { + ret = ext4_mb_init_cache(folio, e4b->bd_bitmap, gfp); if (ret) { - unlock_page(page); + folio_unlock(folio); goto err; } } - unlock_page(page); + folio_unlock(folio); } } - if (page == NULL) { - ret = -ENOMEM; + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto err; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } - /* Pages marked accessed already */ - e4b->bd_buddy_page = page; - e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); + /* Folios marked accessed already */ + e4b->bd_buddy_folio = folio; + e4b->bd_buddy = folio_address(folio) + (poff * sb->s_blocksize); return 0; err: - if (page) - put_page(page); - if (e4b->bd_bitmap_page) - put_page(e4b->bd_bitmap_page); + if (!IS_ERR_OR_NULL(folio)) + folio_put(folio); + if (e4b->bd_bitmap_folio) + folio_put(e4b->bd_bitmap_folio); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; @@ -1732,10 +1730,10 @@ static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) { - if (e4b->bd_bitmap_page) - put_page(e4b->bd_bitmap_page); - if (e4b->bd_buddy_page) - put_page(e4b->bd_buddy_page); + if (e4b->bd_bitmap_folio) + folio_put(e4b->bd_bitmap_folio); + if (e4b->bd_buddy_folio) + folio_put(e4b->bd_buddy_folio); } @@ -1937,12 +1935,12 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(sbi, block); + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, inode ? inode->i_ino : 0, blocknr, "freeing already freed block (bit %u); block bitmap corrupt.", block); - ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, - EXT4_GROUP_INFO_BBITMAP_CORRUPT); return; } @@ -1984,8 +1982,7 @@ check: static int mb_find_extent(struct ext4_buddy *e4b, int block, int needed, struct ext4_free_extent *ex) { - int next = block; - int max, order; + int max, order, next; void *buddy; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); @@ -2003,16 +2000,12 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block, /* find actual order */ order = mb_find_order_for_block(e4b, block); - block = block >> order; - ex->fe_len = 1 << order; - ex->fe_start = block << order; + ex->fe_len = (1 << order) - (block & ((1 << order) - 1)); + ex->fe_start = block; ex->fe_group = e4b->bd_group; - /* calc difference from given start */ - next = next - ex->fe_start; - ex->fe_len -= next; - ex->fe_start += next; + block = block >> order; while (needed > ex->fe_len && mb_find_buddy(e4b, order, &max)) { @@ -2050,13 +2043,12 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) int ord; int mlen = 0; int max = 0; - int cur; int start = ex->fe_start; int len = ex->fe_len; unsigned ret = 0; int len0 = len; void *buddy; - bool split = false; + int ord_start, ord_end; BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); BUG_ON(e4b->bd_group != ex->fe_group); @@ -2081,16 +2073,12 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) /* let's maintain buddy itself */ while (len) { - if (!split) - ord = mb_find_order_for_block(e4b, start); + ord = mb_find_order_for_block(e4b, start); if (((start >> ord) << ord) == start && len >= (1 << ord)) { /* the whole chunk may be allocated at once! */ mlen = 1 << ord; - if (!split) - buddy = mb_find_buddy(e4b, ord, &max); - else - split = false; + buddy = mb_find_buddy(e4b, ord, &max); BUG_ON((start >> ord) >= max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; @@ -2104,20 +2092,29 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) if (ret == 0) ret = len | (ord << 16); - /* we have to split large buddy */ BUG_ON(ord <= 0); buddy = mb_find_buddy(e4b, ord, &max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; - ord--; - cur = (start >> ord) & ~1U; - buddy = mb_find_buddy(e4b, ord, &max); - mb_clear_bit(cur, buddy); - mb_clear_bit(cur + 1, buddy); - e4b->bd_info->bb_counters[ord]++; - e4b->bd_info->bb_counters[ord]++; - split = true; + ord_start = (start >> ord) << ord; + ord_end = ord_start + (1 << ord); + /* first chunk */ + if (start > ord_start) + ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, + ord_start, start - ord_start, + e4b->bd_info); + + /* last chunk */ + if (start + len < ord_end) { + ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, + start + len, + ord_end - (start + len), + e4b->bd_info); + break; + } + len = start + len - ord_end; + start = ord_end; } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); @@ -2159,10 +2156,10 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, * double allocate blocks. The reference is dropped * in ext4_mb_release_context */ - ac->ac_bitmap_page = e4b->bd_bitmap_page; - get_page(ac->ac_bitmap_page); - ac->ac_buddy_page = e4b->bd_buddy_page; - get_page(ac->ac_buddy_page); + ac->ac_bitmap_folio = e4b->bd_bitmap_folio; + folio_get(ac->ac_bitmap_folio); + ac->ac_buddy_folio = e4b->bd_buddy_folio; + folio_get(ac->ac_buddy_folio); /* store last allocated for subsequent stream allocation */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { spin_lock(&sbi->s_md_lock); @@ -2354,7 +2351,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, ex.fe_logical = 0xDEADFA11; /* debug value */ if (max >= ac->ac_g_ex.fe_len && - ac->ac_g_ex.fe_len == EXT4_B2C(sbi, sbi->s_stripe)) { + ac->ac_g_ex.fe_len == EXT4_NUM_B2C(sbi, sbi->s_stripe)) { ext4_fsblk_t start; start = ext4_grp_offs_to_block(ac->ac_sb, &ex); @@ -2415,12 +2412,12 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, k = mb_find_next_zero_bit(buddy, max, 0); if (k >= max) { + ext4_mark_group_bitmap_corrupted(ac->ac_sb, + e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0, "%d free clusters of order %d. But found 0", grp->bb_counters[i], i); - ext4_mark_group_bitmap_corrupted(ac->ac_sb, - e4b->bd_group, - EXT4_GROUP_INFO_BBITMAP_CORRUPT); break; } ac->ac_found++; @@ -2471,12 +2468,12 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, * free blocks even though group info says we * have free blocks */ + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. But bitmap says 0", free); - ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, - EXT4_GROUP_INFO_BBITMAP_CORRUPT); break; } @@ -2502,12 +2499,12 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, if (WARN_ON(ex.fe_len <= 0)) break; if (free < ex.fe_len) { + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. But got %d blocks", free, ex.fe_len); - ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, - EXT4_GROUP_INFO_BBITMAP_CORRUPT); /* * The number of free blocks differs. This mostly * indicate that the bitmap is corrupt. So exit @@ -2551,7 +2548,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, do_div(a, sbi->s_stripe); i = (a * sbi->s_stripe) - first_group_block; - stripe = EXT4_B2C(sbi, sbi->s_stripe); + stripe = EXT4_NUM_B2C(sbi, sbi->s_stripe); i = EXT4_B2C(sbi, i); while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { if (!mb_test_bit(i, bitmap)) { @@ -2685,7 +2682,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, int ret; /* - * cr=CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic + * CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic * search to find large good chunks almost for free. If buddy * data is not ready, then this optimization makes no sense. But * we never skip the first block group in a flex_bg, since this @@ -2866,6 +2863,7 @@ repeat: group = ac->ac_g_ex.fe_group; ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; prefetch_grp = group; + nr = 0; for (i = 0, new_cr = cr; i < ngroups; i++, ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) { @@ -2924,14 +2922,21 @@ repeat: ac->ac_groups_scanned++; if (cr == CR_POWER2_ALIGNED) ext4_mb_simple_scan_group(ac, &e4b); - else if ((cr == CR_GOAL_LEN_FAST || - cr == CR_BEST_AVAIL_LEN) && - sbi->s_stripe && - !(ac->ac_g_ex.fe_len % - EXT4_B2C(sbi, sbi->s_stripe))) - ext4_mb_scan_aligned(ac, &e4b); - else - ext4_mb_complex_scan_group(ac, &e4b); + else { + bool is_stripe_aligned = + (sbi->s_stripe >= + sbi->s_cluster_ratio) && + !(ac->ac_g_ex.fe_len % + EXT4_NUM_B2C(sbi, sbi->s_stripe)); + + if ((cr == CR_GOAL_LEN_FAST || + cr == CR_BEST_AVAIL_LEN) && + is_stripe_aligned) + ext4_mb_scan_aligned(ac, &e4b); + + if (ac->ac_status == AC_STATUS_CONTINUE) + ext4_mb_complex_scan_group(ac, &e4b); + } ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); @@ -3020,8 +3025,8 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) { struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group = (ext4_group_t) ((unsigned long) v); - int i; - int err, buddy_loaded = 0; + int i, err; + char nbuf[16]; struct ext4_buddy e4b; struct ext4_group_info *grinfo; unsigned char blocksize_bits = min_t(unsigned char, @@ -3048,17 +3053,17 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { - seq_printf(seq, "#%-5u: I/O error\n", group); + seq_printf(seq, "#%-5u: %s\n", group, ext4_decode_error(NULL, err, nbuf)); return 0; } - buddy_loaded = 1; + ext4_mb_unload_buddy(&e4b); } + /* + * We care only about free space counters in the group info and + * these are safe to access even after the buddy has been unloaded + */ memcpy(&sg, grinfo, i); - - if (buddy_loaded) - ext4_mb_unload_buddy(&e4b); - seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, sg.info.bb_fragments, sg.info.bb_first_free); for (i = 0; i <= 13; i++) @@ -3067,8 +3072,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) seq_puts(seq, " ]"); if (EXT4_MB_GRP_BBITMAP_CORRUPT(&sg.info)) seq_puts(seq, " Block bitmap corrupted!"); - seq_puts(seq, "\n"); - + seq_putc(seq, '\n'); return 0; } @@ -3191,7 +3195,6 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) } static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) -__acquires(&EXT4_SB(sb)->s_mb_rb_lock) { struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; @@ -3445,10 +3448,11 @@ static int ext4_mb_init_backend(struct super_block *sb) } if (sbi->s_mb_prefetch > ext4_get_groups_count(sb)) sbi->s_mb_prefetch = ext4_get_groups_count(sb); - /* now many real IOs to prefetch within a single allocation at cr=0 - * given cr=0 is an CPU-related optimization we shouldn't try to - * load too many groups, at some point we should start to use what - * we've got in memory. + /* + * now many real IOs to prefetch within a single allocation at + * CR_POWER2_ALIGNED. Given CR_POWER2_ALIGNED is an CPU-related + * optimization we shouldn't try to load too many groups, at some point + * we should start to use what we've got in memory. * with an average random access time 5ms, it'd take a second to get * 200 groups (* N with flex_bg), so let's make this limit 4 */ @@ -3662,7 +3666,8 @@ int ext4_mb_init(struct super_block *sb) spin_lock_init(&sbi->s_md_lock); sbi->s_mb_free_pending = 0; - INIT_LIST_HEAD(&sbi->s_freed_data_list); + INIT_LIST_HEAD(&sbi->s_freed_data_list[0]); + INIT_LIST_HEAD(&sbi->s_freed_data_list[1]); INIT_LIST_HEAD(&sbi->s_discard_list); INIT_WORK(&sbi->s_discard_work, ext4_discard_work); atomic_set(&sbi->s_retry_alloc_pending, 0); @@ -3698,7 +3703,7 @@ int ext4_mb_init(struct super_block *sb) */ if (sbi->s_stripe > 1) { sbi->s_mb_group_prealloc = roundup( - sbi->s_mb_group_prealloc, EXT4_B2C(sbi, sbi->s_stripe)); + sbi->s_mb_group_prealloc, EXT4_NUM_B2C(sbi, sbi->s_stripe)); } sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); @@ -3757,7 +3762,7 @@ static int ext4_mb_cleanup_pa(struct ext4_group_info *grp) return count; } -int ext4_mb_release(struct super_block *sb) +void ext4_mb_release(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; @@ -3833,13 +3838,10 @@ int ext4_mb_release(struct super_block *sb) } free_percpu(sbi->s_locality_groups); - - return 0; } static inline int ext4_issue_discard(struct super_block *sb, - ext4_group_t block_group, ext4_grpblk_t cluster, int count, - struct bio **biop) + ext4_group_t block_group, ext4_grpblk_t cluster, int count) { ext4_fsblk_t discard_block; @@ -3848,13 +3850,8 @@ static inline int ext4_issue_discard(struct super_block *sb, count = EXT4_C2B(EXT4_SB(sb), count); trace_ext4_discard_blocks(sb, (unsigned long long) discard_block, count); - if (biop) { - return __blkdev_issue_discard(sb->s_bdev, - (sector_t)discard_block << (sb->s_blocksize_bits - 9), - (sector_t)count << (sb->s_blocksize_bits - 9), - GFP_NOFS, biop); - } else - return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); + + return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); } static void ext4_free_data_in_buddy(struct super_block *sb, @@ -3893,8 +3890,8 @@ static void ext4_free_data_in_buddy(struct super_block *sb, /* No more items in the per group rb tree * balance refcounts from ext4_mb_free_metadata() */ - put_page(e4b.bd_buddy_page); - put_page(e4b.bd_bitmap_page); + folio_put(e4b.bd_buddy_folio); + folio_put(e4b.bd_bitmap_folio); } ext4_unlock_group(sb, entry->efd_group); ext4_mb_unload_buddy(&e4b); @@ -3911,19 +3908,10 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_free_data *entry, *tmp; LIST_HEAD(freed_data_list); - struct list_head *cut_pos = NULL; + struct list_head *s_freed_head = &sbi->s_freed_data_list[commit_tid & 1]; bool wake; - spin_lock(&sbi->s_md_lock); - list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) { - if (entry->efd_tid != commit_tid) - break; - cut_pos = &entry->efd_list; - } - if (cut_pos) - list_cut_position(&freed_data_list, &sbi->s_freed_data_list, - cut_pos); - spin_unlock(&sbi->s_md_lock); + list_replace_init(s_freed_head, &freed_data_list); list_for_each_entry(entry, &freed_data_list, efd_list) ext4_free_data_in_buddy(sb, entry); @@ -3981,6 +3969,111 @@ void ext4_exit_mballoc(void) ext4_groupinfo_destroy_slabs(); } +#define EXT4_MB_BITMAP_MARKED_CHECK 0x0001 +#define EXT4_MB_SYNC_UPDATE 0x0002 +static int +ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state, + ext4_group_t group, ext4_grpblk_t blkoff, + ext4_grpblk_t len, int flags, ext4_grpblk_t *ret_changed) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct buffer_head *bitmap_bh = NULL; + struct ext4_group_desc *gdp; + struct buffer_head *gdp_bh; + int err; + unsigned int i, already, changed = len; + + KUNIT_STATIC_STUB_REDIRECT(ext4_mb_mark_context, + handle, sb, state, group, blkoff, len, + flags, ret_changed); + + if (ret_changed) + *ret_changed = 0; + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bitmap_bh)) + return PTR_ERR(bitmap_bh); + + if (handle) { + BUFFER_TRACE(bitmap_bh, "getting write access"); + err = ext4_journal_get_write_access(handle, sb, bitmap_bh, + EXT4_JTR_NONE); + if (err) + goto out_err; + } + + err = -EIO; + gdp = ext4_get_group_desc(sb, group, &gdp_bh); + if (!gdp) + goto out_err; + + if (handle) { + BUFFER_TRACE(gdp_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, gdp_bh, + EXT4_JTR_NONE); + if (err) + goto out_err; + } + + ext4_lock_group(sb, group); + if (ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_clusters_after_init(sb, group, gdp)); + } + + if (flags & EXT4_MB_BITMAP_MARKED_CHECK) { + already = 0; + for (i = 0; i < len; i++) + if (mb_test_bit(blkoff + i, bitmap_bh->b_data) == + state) + already++; + changed = len - already; + } + + if (state) { + mb_set_bits(bitmap_bh->b_data, blkoff, len); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_group_clusters(sb, gdp) - changed); + } else { + mb_clear_bits(bitmap_bh->b_data, blkoff, len); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_group_clusters(sb, gdp) + changed); + } + + ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); + ext4_group_desc_csum_set(sb, group, gdp); + ext4_unlock_group(sb, group); + if (ret_changed) + *ret_changed = changed; + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, group); + struct flex_groups *fg = sbi_array_rcu_deref(sbi, + s_flex_groups, flex_group); + + if (state) + atomic64_sub(changed, &fg->free_clusters); + else + atomic64_add(changed, &fg->free_clusters); + } + + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + if (err) + goto out_err; + err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); + if (err) + goto out_err; + + if (flags & EXT4_MB_SYNC_UPDATE) { + sync_dirty_buffer(bitmap_bh); + sync_dirty_buffer(gdp_bh); + } + +out_err: + brelse(bitmap_bh); + return err; +} /* * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps @@ -3990,13 +4083,13 @@ static noinline_for_stack int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, handle_t *handle, unsigned int reserv_clstrs) { - struct buffer_head *bitmap_bh = NULL; struct ext4_group_desc *gdp; - struct buffer_head *gdp_bh; struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block; int err, len; + int flags = 0; + ext4_grpblk_t changed; BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(ac->ac_b_ex.fe_len <= 0); @@ -4004,32 +4097,13 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, sb = ac->ac_sb; sbi = EXT4_SB(sb); - bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); - if (IS_ERR(bitmap_bh)) { - return PTR_ERR(bitmap_bh); - } - - BUFFER_TRACE(bitmap_bh, "getting write access"); - err = ext4_journal_get_write_access(handle, sb, bitmap_bh, - EXT4_JTR_NONE); - if (err) - goto out_err; - - err = -EIO; - gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh); + gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, NULL); if (!gdp) - goto out_err; - + return -EIO; ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, ext4_free_group_clusters(sb, gdp)); - BUFFER_TRACE(gdp_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE); - if (err) - goto out_err; - block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); - len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (!ext4_inode_block_valid(ac->ac_inode, block, len)) { ext4_error(sb, "Allocating blocks %llu-%llu which overlap " @@ -4038,41 +4112,29 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, * Fix the bitmap and return EFSCORRUPTED * We leak some of the blocks here. */ - ext4_lock_group(sb, ac->ac_b_ex.fe_group); - mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, - ac->ac_b_ex.fe_len); - ext4_unlock_group(sb, ac->ac_b_ex.fe_group); - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + err = ext4_mb_mark_context(handle, sb, true, + ac->ac_b_ex.fe_group, + ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len, + 0, NULL); if (!err) err = -EFSCORRUPTED; - goto out_err; + return err; } - ext4_lock_group(sb, ac->ac_b_ex.fe_group); #ifdef AGGRESSIVE_CHECK - { - int i; - for (i = 0; i < ac->ac_b_ex.fe_len; i++) { - BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i, - bitmap_bh->b_data)); - } - } + flags |= EXT4_MB_BITMAP_MARKED_CHECK; #endif - mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, - ac->ac_b_ex.fe_len); - if (ext4_has_group_desc_csum(sb) && - (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - ext4_free_group_clusters_set(sb, gdp, - ext4_free_clusters_after_init(sb, - ac->ac_b_ex.fe_group, gdp)); - } - len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; - ext4_free_group_clusters_set(sb, gdp, len); - ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); - ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp); + err = ext4_mb_mark_context(handle, sb, true, ac->ac_b_ex.fe_group, + ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len, + flags, &changed); + + if (err && changed == 0) + return err; - ext4_unlock_group(sb, ac->ac_b_ex.fe_group); +#ifdef AGGRESSIVE_CHECK + BUG_ON(changed != ac->ac_b_ex.fe_len); +#endif percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); /* * Now reduce the dirty block count also. Should not go negative @@ -4082,21 +4144,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs); - if (sbi->s_log_groups_per_flex) { - ext4_group_t flex_group = ext4_flex_group(sbi, - ac->ac_b_ex.fe_group); - atomic64_sub(ac->ac_b_ex.fe_len, - &sbi_array_rcu_deref(sbi, s_flex_groups, - flex_group)->free_clusters); - } - - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); - if (err) - goto out_err; - err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); - -out_err: - brelse(bitmap_bh); return err; } @@ -4105,17 +4152,13 @@ out_err: * blocks in bitmaps and update counters. */ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, - int len, int state) + int len, bool state) { - struct buffer_head *bitmap_bh = NULL; - struct ext4_group_desc *gdp; - struct buffer_head *gdp_bh; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; ext4_grpblk_t blkoff; - int i, err = 0; - int already; - unsigned int clen, clen_changed, thisgrp_len; + int err = 0; + unsigned int clen, thisgrp_len; while (len > 0) { ext4_get_group_no_and_offset(sb, block, &group, &blkoff); @@ -4136,80 +4179,21 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, ext4_error(sb, "Marking blocks in system zone - " "Block = %llu, len = %u", block, thisgrp_len); - bitmap_bh = NULL; break; } - bitmap_bh = ext4_read_block_bitmap(sb, group); - if (IS_ERR(bitmap_bh)) { - err = PTR_ERR(bitmap_bh); - bitmap_bh = NULL; - break; - } - - err = -EIO; - gdp = ext4_get_group_desc(sb, group, &gdp_bh); - if (!gdp) - break; - - ext4_lock_group(sb, group); - already = 0; - for (i = 0; i < clen; i++) - if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) == - !state) - already++; - - clen_changed = clen - already; - if (state) - mb_set_bits(bitmap_bh->b_data, blkoff, clen); - else - mb_clear_bits(bitmap_bh->b_data, blkoff, clen); - if (ext4_has_group_desc_csum(sb) && - (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - ext4_free_group_clusters_set(sb, gdp, - ext4_free_clusters_after_init(sb, group, gdp)); - } - if (state) - clen = ext4_free_group_clusters(sb, gdp) - clen_changed; - else - clen = ext4_free_group_clusters(sb, gdp) + clen_changed; - - ext4_free_group_clusters_set(sb, gdp, clen); - ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); - ext4_group_desc_csum_set(sb, group, gdp); - - ext4_unlock_group(sb, group); - - if (sbi->s_log_groups_per_flex) { - ext4_group_t flex_group = ext4_flex_group(sbi, group); - struct flex_groups *fg = sbi_array_rcu_deref(sbi, - s_flex_groups, flex_group); - - if (state) - atomic64_sub(clen_changed, &fg->free_clusters); - else - atomic64_add(clen_changed, &fg->free_clusters); - - } - - err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); - if (err) - break; - sync_dirty_buffer(bitmap_bh); - err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); - sync_dirty_buffer(gdp_bh); + err = ext4_mb_mark_context(NULL, sb, state, + group, blkoff, clen, + EXT4_MB_BITMAP_MARKED_CHECK | + EXT4_MB_SYNC_UPDATE, + NULL); if (err) break; block += thisgrp_len; len -= thisgrp_len; - brelse(bitmap_bh); BUG_ON(len < 0); } - - if (err) - brelse(bitmap_bh); } /* @@ -5331,7 +5315,7 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac) * the caller MUST hold group/inode locks. * TODO: optimize the case when there are no in-core structures yet */ -static noinline_for_stack int +static noinline_for_stack void ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, struct ext4_prealloc_space *pa) { @@ -5381,11 +5365,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, */ } atomic_add(free, &sbi->s_mb_discarded); - - return 0; } -static noinline_for_stack int +static noinline_for_stack void ext4_mb_release_group_pa(struct ext4_buddy *e4b, struct ext4_prealloc_space *pa) { @@ -5399,13 +5381,11 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) { ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu", e4b->bd_group, group, pa->pa_pstart); - return 0; + return; } mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); - - return 0; } /* @@ -5526,7 +5506,7 @@ out_dbg: * * FIXME!! Make sure it is valid at all the call sites */ -void ext4_discard_preallocations(struct inode *inode, unsigned int needed) +void ext4_discard_preallocations(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct super_block *sb = inode->i_sb; @@ -5538,9 +5518,8 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) struct rb_node *iter; int err; - if (!S_ISREG(inode->i_mode)) { + if (!S_ISREG(inode->i_mode)) return; - } if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) return; @@ -5548,15 +5527,12 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) mb_debug(sb, "discard preallocation for inode %lu\n", inode->i_ino); trace_ext4_discard_preallocations(inode, - atomic_read(&ei->i_prealloc_active), needed); - - if (needed == 0) - needed = UINT_MAX; + atomic_read(&ei->i_prealloc_active)); repeat: /* first, collect all pa's in the inode */ write_lock(&ei->i_prealloc_lock); - for (iter = rb_first(&ei->i_prealloc_node); iter && needed; + for (iter = rb_first(&ei->i_prealloc_node); iter; iter = rb_next(iter)) { pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); @@ -5580,7 +5556,6 @@ repeat: spin_unlock(&pa->pa_lock); rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); list_add(&pa->u.pa_tmp_list, &list); - needed--; continue; } @@ -5990,7 +5965,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) /* * release all resource we used in allocation */ -static int ext4_mb_release_context(struct ext4_allocation_context *ac) +static void ext4_mb_release_context(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *pa = ac->ac_pa; @@ -6020,14 +5995,13 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) ext4_mb_put_pa(ac, ac->ac_sb, pa); } - if (ac->ac_bitmap_page) - put_page(ac->ac_bitmap_page); - if (ac->ac_buddy_page) - put_page(ac->ac_buddy_page); + if (ac->ac_bitmap_folio) + folio_put(ac->ac_bitmap_folio); + if (ac->ac_buddy_folio) + folio_put(ac->ac_buddy_folio); if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) mutex_unlock(&ac->ac_lg->lg_mutex); ext4_mb_collect_stats(ac); - return 0; } static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) @@ -6142,7 +6116,7 @@ ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp) } block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i); - ext4_mb_mark_bb(sb, block, 1, 1); + ext4_mb_mark_bb(sb, block, 1, true); ar->len = 1; *errp = 0; @@ -6340,8 +6314,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct rb_node *parent = NULL, *new_node; BUG_ON(!ext4_handle_valid(handle)); - BUG_ON(e4b->bd_bitmap_page == NULL); - BUG_ON(e4b->bd_buddy_page == NULL); + BUG_ON(e4b->bd_bitmap_folio == NULL); + BUG_ON(e4b->bd_buddy_folio == NULL); new_node = &new_entry->efd_node; cluster = new_entry->efd_start_cluster; @@ -6352,8 +6326,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, * otherwise we'll refresh it from * on-disk bitmap and lose not-yet-available * blocks */ - get_page(e4b->bd_buddy_page); - get_page(e4b->bd_bitmap_page); + folio_get(e4b->bd_buddy_folio); + folio_get(e4b->bd_bitmap_folio); } while (*n) { parent = *n; @@ -6391,7 +6365,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, } spin_lock(&sbi->s_md_lock); - list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list); + list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list[new_entry->efd_tid & 1]); sbi->s_mb_free_pending += clusters; spin_unlock(&sbi->s_md_lock); } @@ -6399,43 +6373,15 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block, unsigned long count) { - struct buffer_head *bitmap_bh; struct super_block *sb = inode->i_sb; - struct ext4_group_desc *gdp; - struct buffer_head *gdp_bh; ext4_group_t group; ext4_grpblk_t blkoff; - int already_freed = 0, err, i; ext4_get_group_no_and_offset(sb, block, &group, &blkoff); - bitmap_bh = ext4_read_block_bitmap(sb, group); - if (IS_ERR(bitmap_bh)) { - pr_warn("Failed to read block bitmap\n"); - return; - } - gdp = ext4_get_group_desc(sb, group, &gdp_bh); - if (!gdp) - goto err_out; - - for (i = 0; i < count; i++) { - if (!mb_test_bit(blkoff + i, bitmap_bh->b_data)) - already_freed++; - } - mb_clear_bits(bitmap_bh->b_data, blkoff, count); - err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); - if (err) - goto err_out; - ext4_free_group_clusters_set( - sb, gdp, ext4_free_group_clusters(sb, gdp) + - count - already_freed); - ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); - ext4_group_desc_csum_set(sb, group, gdp); - ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); - sync_dirty_buffer(bitmap_bh); - sync_dirty_buffer(gdp_bh); - -err_out: - brelse(bitmap_bh); + ext4_mb_mark_context(NULL, sb, false, group, blkoff, count, + EXT4_MB_BITMAP_MARKED_CHECK | + EXT4_MB_SYNC_UPDATE, + NULL); } /** @@ -6451,19 +6397,17 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode, ext4_fsblk_t block, unsigned long count, int flags) { - struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode->i_sb; - struct ext4_group_desc *gdp; struct ext4_group_info *grp; unsigned int overflow; ext4_grpblk_t bit; - struct buffer_head *gd_bh; ext4_group_t block_group; struct ext4_sb_info *sbi; struct ext4_buddy e4b; unsigned int count_clusters; int err = 0; - int ret; + int mark_flags = 0; + ext4_grpblk_t changed; sbi = EXT4_SB(sb); @@ -6472,7 +6416,7 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode, ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); /* err = 0. ext4_std_error should be a no op */ - goto error_return; + goto error_out; } flags |= EXT4_FREE_BLOCKS_VALIDATED; @@ -6496,55 +6440,35 @@ do_more: flags &= ~EXT4_FREE_BLOCKS_VALIDATED; } count_clusters = EXT4_NUM_B2C(sbi, count); - bitmap_bh = ext4_read_block_bitmap(sb, block_group); - if (IS_ERR(bitmap_bh)) { - err = PTR_ERR(bitmap_bh); - bitmap_bh = NULL; - goto error_return; - } - gdp = ext4_get_group_desc(sb, block_group, &gd_bh); - if (!gdp) { - err = -EIO; - goto error_return; - } + trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); + + /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */ + err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, + GFP_NOFS|__GFP_NOFAIL); + if (err) + goto error_out; if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); /* err = 0. ext4_std_error should be a no op */ - goto error_return; + goto error_clean; } - BUFFER_TRACE(bitmap_bh, "getting write access"); - err = ext4_journal_get_write_access(handle, sb, bitmap_bh, - EXT4_JTR_NONE); - if (err) - goto error_return; - - /* - * We are about to modify some metadata. Call the journal APIs - * to unshare ->b_data if a currently-committing transaction is - * using it - */ - BUFFER_TRACE(gd_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE); - if (err) - goto error_return; #ifdef AGGRESSIVE_CHECK - { - int i; - for (i = 0; i < count_clusters; i++) - BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); - } + mark_flags |= EXT4_MB_BITMAP_MARKED_CHECK; #endif - trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); + err = ext4_mb_mark_context(handle, sb, false, block_group, bit, + count_clusters, mark_flags, &changed); - /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */ - err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, - GFP_NOFS|__GFP_NOFAIL); - if (err) - goto error_return; + + if (err && changed == 0) + goto error_clean; + +#ifdef AGGRESSIVE_CHECK + BUG_ON(changed != count_clusters); +#endif /* * We need to make sure we don't reuse the freed block until after the @@ -6568,17 +6492,18 @@ do_more: new_entry->efd_tid = handle->h_transaction->t_tid; ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); ext4_mb_free_metadata(handle, &e4b, new_entry); } else { - /* need to update group_info->bb_free and bitmap - * with group lock held. generate_buddy look at - * them with group lock_held - */ if (test_opt(sb, DISCARD)) { err = ext4_issue_discard(sb, block_group, bit, - count_clusters, NULL); - if (err && err != -EOPNOTSUPP) + count_clusters); + /* + * Ignore EOPNOTSUPP error. This is consistent with + * what happens when using journal. + */ + if (err == -EOPNOTSUPP) + err = 0; + if (err) ext4_msg(sb, KERN_WARNING, "discard request in" " group:%u block:%d count:%lu failed" " with %d", block_group, bit, count, @@ -6588,23 +6513,11 @@ do_more: EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); mb_free_blocks(inode, &e4b, bit, count_clusters); } - ret = ext4_free_group_clusters(sb, gdp) + count_clusters; - ext4_free_group_clusters_set(sb, gdp, ret); - ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); - ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); - if (sbi->s_log_groups_per_flex) { - ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic64_add(count_clusters, - &sbi_array_rcu_deref(sbi, s_flex_groups, - flex_group)->free_clusters); - } - /* * on a bigalloc file system, defer the s_freeclusters_counter * update to the caller (ext4_remove_space and friends) so they @@ -6617,28 +6530,18 @@ do_more: count_clusters); } - ext4_mb_unload_buddy(&e4b); - - /* We dirtied the bitmap block */ - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); - - /* And the group descriptor block */ - BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); - ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); - if (!err) - err = ret; - if (overflow && !err) { block += count; count = overflow; - put_bh(bitmap_bh); + ext4_mb_unload_buddy(&e4b); /* The range changed so it's no longer validated */ flags &= ~EXT4_FREE_BLOCKS_VALIDATED; goto do_more; } -error_return: - brelse(bitmap_bh); + +error_clean: + ext4_mb_unload_buddy(&e4b); +error_out: ext4_std_error(sb, err); } @@ -6736,7 +6639,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, for (i = 0; i < count; i++) { cond_resched(); if (is_metadata) - bh = sb_find_get_block(inode->i_sb, block + i); + bh = sb_find_get_block_nonatomic(inode->i_sb, + block + i); ext4_forget(handle, is_metadata, inode, bh, block + i); } } @@ -6756,23 +6660,19 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count) { - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *gd_bh; ext4_group_t block_group; ext4_grpblk_t bit; - unsigned int i; - struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_buddy e4b; - int err = 0, ret, free_clusters_count; - ext4_grpblk_t clusters_freed; + int err = 0; ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block); ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1); unsigned long cluster_count = last_cluster - first_cluster + 1; + ext4_grpblk_t changed; ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); - if (count == 0) + if (cluster_count == 0) return 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); @@ -6784,99 +6684,39 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_warning(sb, "too many blocks added to group %u", block_group); err = -EINVAL; - goto error_return; - } - - bitmap_bh = ext4_read_block_bitmap(sb, block_group); - if (IS_ERR(bitmap_bh)) { - err = PTR_ERR(bitmap_bh); - bitmap_bh = NULL; - goto error_return; + goto error_out; } - desc = ext4_get_group_desc(sb, block_group, &gd_bh); - if (!desc) { - err = -EIO; - goto error_return; - } + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) + goto error_out; if (!ext4_sb_block_valid(sb, NULL, block, count)) { ext4_error(sb, "Adding blocks in system zones - " "Block = %llu, count = %lu", block, count); err = -EINVAL; - goto error_return; + goto error_clean; } - BUFFER_TRACE(bitmap_bh, "getting write access"); - err = ext4_journal_get_write_access(handle, sb, bitmap_bh, - EXT4_JTR_NONE); - if (err) - goto error_return; + err = ext4_mb_mark_context(handle, sb, false, block_group, bit, + cluster_count, EXT4_MB_BITMAP_MARKED_CHECK, + &changed); + if (err && changed == 0) + goto error_clean; - /* - * We are about to modify some metadata. Call the journal APIs - * to unshare ->b_data if a currently-committing transaction is - * using it - */ - BUFFER_TRACE(gd_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE); - if (err) - goto error_return; - - for (i = 0, clusters_freed = 0; i < cluster_count; i++) { - BUFFER_TRACE(bitmap_bh, "clear bit"); - if (!mb_test_bit(bit + i, bitmap_bh->b_data)) { - ext4_error(sb, "bit already cleared for block %llu", - (ext4_fsblk_t)(block + i)); - BUFFER_TRACE(bitmap_bh, "bit already cleared"); - } else { - clusters_freed++; - } - } + if (changed != cluster_count) + ext4_error(sb, "bit already cleared in group %u", block_group); - err = ext4_mb_load_buddy(sb, block_group, &e4b); - if (err) - goto error_return; - - /* - * need to update group_info->bb_free and bitmap - * with group lock held. generate_buddy look at - * them with group lock_held - */ ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, cluster_count); mb_free_blocks(NULL, &e4b, bit, cluster_count); - free_clusters_count = clusters_freed + - ext4_free_group_clusters(sb, desc); - ext4_free_group_clusters_set(sb, desc, free_clusters_count); - ext4_block_bitmap_csum_set(sb, desc, bitmap_bh); - ext4_group_desc_csum_set(sb, block_group, desc); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, - clusters_freed); - - if (sbi->s_log_groups_per_flex) { - ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic64_add(clusters_freed, - &sbi_array_rcu_deref(sbi, s_flex_groups, - flex_group)->free_clusters); - } + changed); +error_clean: ext4_mb_unload_buddy(&e4b); - - /* We dirtied the bitmap block */ - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); - - /* And the group descriptor block */ - BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); - ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); - if (!err) - err = ret; - -error_return: - brelse(bitmap_bh); +error_out: ext4_std_error(sb, err); return err; } @@ -6915,7 +6755,7 @@ __acquires(bitlock) */ mb_mark_used(e4b, &ex); ext4_unlock_group(sb, group); - ret = ext4_issue_discard(sb, group, start, count, NULL); + ret = ext4_issue_discard(sb, group, start, count); ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); return ret; @@ -7207,3 +7047,7 @@ out_unload: return error; } + +#ifdef CONFIG_EXT4_KUNIT_TESTS +#include "mballoc-test.c" +#endif diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index dd16050022f5..f8280de3e882 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -187,12 +187,11 @@ struct ext4_allocation_context { struct ext4_free_extent ac_f_ex; /* - * goal len can change in CR1.5, so save the original len. This is - * used while adjusting the PA window and for accounting. + * goal len can change in CR_BEST_AVAIL_LEN, so save the original len. + * This is used while adjusting the PA window and for accounting. */ ext4_grpblk_t ac_orig_goal_len; - __u32 ac_groups_considered; __u32 ac_flags; /* allocation hints */ __u32 ac_groups_linear_remaining; __u16 ac_groups_scanned; @@ -205,8 +204,8 @@ struct ext4_allocation_context { __u8 ac_2order; /* if request is to allocate 2^N blocks and * N > 0, the field stores N, otherwise 0 */ __u8 ac_op; /* operation, for history only */ - struct page *ac_bitmap_page; - struct page *ac_buddy_page; + struct folio *ac_bitmap_folio; + struct folio *ac_buddy_folio; struct ext4_prealloc_space *ac_pa; struct ext4_locality_group *ac_lg; }; @@ -216,9 +215,9 @@ struct ext4_allocation_context { #define AC_STATUS_BREAK 3 struct ext4_buddy { - struct page *bd_buddy_page; + struct folio *bd_buddy_folio; void *bd_buddy; - struct page *bd_bitmap_page; + struct folio *bd_bitmap_folio; void *bd_bitmap; struct ext4_group_info *bd_info; struct super_block *bd_sb; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index a5e1492bbaaa..1b0dfd963d3f 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -37,7 +37,6 @@ static int finish_range(handle_t *handle, struct inode *inode, path = ext4_find_extent(inode, lb->first_block, NULL, 0); if (IS_ERR(path)) { retval = PTR_ERR(path); - path = NULL; goto err_out; } @@ -53,7 +52,9 @@ static int finish_range(handle_t *handle, struct inode *inode, retval = ext4_datasem_ensure_credits(handle, inode, needed, needed, 0); if (retval < 0) goto err_out; - retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0); + path = ext4_ext_insert_extent(handle, inode, path, &newext, 0); + if (IS_ERR(path)) + retval = PTR_ERR(path); err_out: up_write((&EXT4_I(inode)->i_data_sem)); ext4_free_ext_path(path); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 5e6b07b34960..898443e98efc 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -17,26 +17,23 @@ * get_ext_path() - Find an extent path for designated logical block number. * @inode: inode to be searched * @lblock: logical block number to find an extent path - * @ppath: pointer to an extent path pointer (for output) + * @path: pointer to an extent path * - * ext4_find_extent wrapper. Return 0 on success, or a negative error value - * on failure. + * ext4_find_extent wrapper. Return an extent path pointer on success, + * or an error pointer on failure. */ -static inline int +static inline struct ext4_ext_path * get_ext_path(struct inode *inode, ext4_lblk_t lblock, - struct ext4_ext_path **ppath) + struct ext4_ext_path *path) { - struct ext4_ext_path *path; - - path = ext4_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE); + path = ext4_find_extent(inode, lblock, path, EXT4_EX_NOCACHE); if (IS_ERR(path)) - return PTR_ERR(path); + return path; if (path[ext_depth(inode)].p_ext == NULL) { ext4_free_ext_path(path); - *ppath = NULL; - return -ENODATA; + return ERR_PTR(-ENODATA); } - return 0; + return path; } /** @@ -94,9 +91,11 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, int ret = 0; ext4_lblk_t last = from + count; while (from < last) { - *err = get_ext_path(inode, from, &path); - if (*err) - goto out; + path = get_ext_path(inode, from, path); + if (IS_ERR(path)) { + *err = PTR_ERR(path); + return ret; + } ext = path[ext_depth(inode)].p_ext; if (unwritten != ext4_ext_is_unwritten(ext)) goto out; @@ -183,14 +182,14 @@ static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to) blocksize = i_blocksize(inode); head = folio_buffers(folio); - if (!head) { - create_empty_buffers(&folio->page, blocksize, 0); - head = folio_buffers(folio); - } + if (!head) + head = create_empty_buffers(folio, blocksize, 0); - block = (sector_t)folio->index << (PAGE_SHIFT - inode->i_blkbits); - for (bh = head, block_start = 0; bh != head || !block_start; - block++, block_start = block_end, bh = bh->b_this_page) { + block = folio_pos(folio) >> inode->i_blkbits; + block_end = 0; + bh = head; + do { + block_start = block_end; block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (!buffer_uptodate(bh)) @@ -216,7 +215,8 @@ static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to) } ext4_read_bh_nowait(bh, 0, NULL, false); nr++; - } + } while (block++, (bh = bh->b_this_page) != head); + /* No io required */ if (!nr) goto out; @@ -386,9 +386,10 @@ data_copy: } /* Perform all necessary steps similar write_begin()/write_end() * but keeping in mind that i_size will not change */ - if (!folio_buffers(folio[0])) - create_empty_buffers(&folio[0]->page, 1 << orig_inode->i_blkbits, 0); bh = folio_buffers(folio[0]); + if (!bh) + bh = create_empty_buffers(folio[0], + 1 << orig_inode->i_blkbits, 0); for (i = 0; i < data_offset_in_page; i++) bh = bh->b_this_page; for (i = 0; i < block_len_in_page; i++) { @@ -633,9 +634,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, int offset_in_page; int unwritten, cur_len; - ret = get_ext_path(orig_inode, o_start, &path); - if (ret) + path = get_ext_path(orig_inode, o_start, path); + if (IS_ERR(path)) { + ret = PTR_ERR(path); goto out; + } ex = path[path->p_depth].p_ext; cur_blk = le32_to_cpu(ex->ee_block); cur_len = ext4_ext_get_actual_len(ex); @@ -693,8 +696,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, out: if (*moved_len) { - ext4_discard_preallocations(orig_inode, 0); - ext4_discard_preallocations(donor_inode, 0); + ext4_discard_preallocations(orig_inode); + ext4_discard_preallocations(donor_inode); } ext4_free_ext_path(path); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 4de1f61bba76..286f8fcb74cc 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1391,62 +1391,11 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) } #if IS_ENABLED(CONFIG_UNICODE) -/* - * Test whether a case-insensitive directory entry matches the filename - * being searched for. If quick is set, assume the name being looked up - * is already in the casefolded form. - * - * Returns: 0 if the directory entry matches, more than 0 if it - * doesn't match or less than zero on error. - */ -static int ext4_ci_compare(const struct inode *parent, const struct qstr *name, - u8 *de_name, size_t de_name_len, bool quick) -{ - const struct super_block *sb = parent->i_sb; - const struct unicode_map *um = sb->s_encoding; - struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len); - struct qstr entry = QSTR_INIT(de_name, de_name_len); - int ret; - - if (IS_ENCRYPTED(parent)) { - const struct fscrypt_str encrypted_name = - FSTR_INIT(de_name, de_name_len); - - decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL); - if (!decrypted_name.name) - return -ENOMEM; - ret = fscrypt_fname_disk_to_usr(parent, 0, 0, &encrypted_name, - &decrypted_name); - if (ret < 0) - goto out; - entry.name = decrypted_name.name; - entry.len = decrypted_name.len; - } - - if (quick) - ret = utf8_strncasecmp_folded(um, name, &entry); - else - ret = utf8_strncasecmp(um, name, &entry); - if (ret < 0) { - /* Handle invalid character sequence as either an error - * or as an opaque byte sequence. - */ - if (sb_has_strict_encoding(sb)) - ret = -EINVAL; - else if (name->len != entry.len) - ret = 1; - else - ret = !!memcmp(name->name, entry.name, entry.len); - } -out: - kfree(decrypted_name.name); - return ret; -} - int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct ext4_filename *name) { - struct fscrypt_str *cf_name = &name->cf_name; + struct qstr *cf_name = &name->cf_name; + unsigned char *buf; struct dx_hash_info *hinfo = &name->hinfo; int len; @@ -1456,18 +1405,18 @@ int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, return 0; } - cf_name->name = kmalloc(EXT4_NAME_LEN, GFP_NOFS); - if (!cf_name->name) + buf = kmalloc(EXT4_NAME_LEN, GFP_NOFS); + if (!buf) return -ENOMEM; - len = utf8_casefold(dir->i_sb->s_encoding, - iname, cf_name->name, - EXT4_NAME_LEN); + len = utf8_casefold(dir->i_sb->s_encoding, iname, buf, EXT4_NAME_LEN); if (len <= 0) { - kfree(cf_name->name); - cf_name->name = NULL; + kfree(buf); + buf = NULL; } + cf_name->name = buf; cf_name->len = (unsigned) len; + if (!IS_ENCRYPTED(dir)) return 0; @@ -1503,22 +1452,29 @@ static bool ext4_match(struct inode *parent, #if IS_ENABLED(CONFIG_UNICODE) if (IS_CASEFOLDED(parent) && (!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) { - if (fname->cf_name.name) { - struct qstr cf = {.name = fname->cf_name.name, - .len = fname->cf_name.len}; - if (IS_ENCRYPTED(parent)) { - if (fname->hinfo.hash != EXT4_DIRENT_HASH(de) || - fname->hinfo.minor_hash != - EXT4_DIRENT_MINOR_HASH(de)) { - - return false; - } - } - return !ext4_ci_compare(parent, &cf, de->name, - de->name_len, true); - } - return !ext4_ci_compare(parent, fname->usr_fname, de->name, - de->name_len, false); + /* + * Just checking IS_ENCRYPTED(parent) below is not + * sufficient to decide whether one can use the hash for + * skipping the string comparison, because the key might + * have been added right after + * ext4_fname_setup_ci_filename(). In this case, a hash + * mismatch will be a false negative. Therefore, make + * sure cf_name was properly initialized before + * considering the calculated hash. + */ + if (IS_ENCRYPTED(parent) && fname->cf_name.name && + (fname->hinfo.hash != EXT4_DIRENT_HASH(de) || + fname->hinfo.minor_hash != EXT4_DIRENT_MINOR_HASH(de))) + return false; + /* + * Treat comparison errors as not a match. The + * only case where it happens is on a disk + * corruption or ENOMEM. + */ + + return generic_ci_match(parent, fname->usr_fname, + &fname->cf_name, de->name, + de->name_len) > 0; } #endif @@ -1618,7 +1574,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir, &has_inline_data); if (inlined) *inlined = has_inline_data; - if (has_inline_data) + if (has_inline_data || IS_ERR(ret)) goto cleanup_and_exit; } @@ -1765,7 +1721,6 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir, struct buffer_head *bh; err = ext4_fname_prepare_lookup(dir, dentry, &fname); - generic_set_encrypted_ci_d_ops(dentry); if (err == -ENOENT) return NULL; if (err) @@ -1873,8 +1828,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi } } -#if IS_ENABLED(CONFIG_UNICODE) - if (!inode && IS_CASEFOLDED(dir)) { + if (IS_ENABLED(CONFIG_UNICODE) && !inode && IS_CASEFOLDED(dir)) { /* Eventually we want to call d_add_ci(dentry, NULL) * for negative dentries in the encoding case as * well. For now, prevent the negative dentry @@ -1882,7 +1836,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi */ return NULL; } -#endif + return d_splice_alias(inode, dentry); } @@ -2041,7 +1995,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, * split it in half by count; each resulting block will have at least * half the space free. */ - if (i > 0) + if (i >= 0) split = count - move; else split = count/2; @@ -2210,7 +2164,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, * happen is that the times are slightly out of date * and/or different from the directory change time. */ - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); ext4_update_dx_flag(dir); inode_inc_iversion(dir); err2 = ext4_mark_inode_dirty(handle, dir); @@ -2329,8 +2283,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, top = data2 + len; while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) { if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len, - (data2 + (blocksize - csum_size) - - (char *) de))) { + (char *)de - data2)) { brelse(bh2); brelse(bh); return -EFSCORRUPTED; @@ -2438,8 +2391,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, sb = dir->i_sb; blocksize = sb->s_blocksize; - if (!dentry->d_name.len) - return -EINVAL; if (fscrypt_is_nokey_name(dentry)) return -ENOKEY; @@ -3248,7 +3199,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) * recovery. */ inode->i_size = 0; ext4_orphan_add(handle, inode); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_set_ctime_current(inode); retval = ext4_mark_inode_dirty(handle, inode); if (retval) @@ -3258,16 +3209,14 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) ext4_fc_track_unlink(handle, dentry); retval = ext4_mark_inode_dirty(handle, dir); -#if IS_ENABLED(CONFIG_UNICODE) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the * negative dentries at ext4_lookup(), when it is better * supported by the VFS for the CI case. */ - if (IS_CASEFOLDED(dir)) + if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) d_invalidate(dentry); -#endif end_rmdir: brelse(bh); @@ -3323,7 +3272,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name, retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto out_handle; - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); ext4_update_dx_flag(dir); retval = ext4_mark_inode_dirty(handle, dir); if (retval) @@ -3369,16 +3318,15 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) goto out_trace; retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry), dentry); -#if IS_ENABLED(CONFIG_UNICODE) + /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the * negative dentries at ext4_lookup(), when it is better * supported by the VFS for the CI case. */ - if (IS_CASEFOLDED(dir)) + if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) d_invalidate(dentry); -#endif out_trace: trace_ext4_unlink_exit(dentry, retval); @@ -3635,10 +3583,14 @@ struct ext4_renament { int dir_inlined; }; -static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent) +static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent, bool is_cross) { int retval; + ent->is_dir = true; + if (!is_cross) + return 0; + ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode, &retval, &ent->parent_de, &ent->dir_inlined); @@ -3656,6 +3608,9 @@ static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent, { int retval; + if (!ent->dir_bh) + return 0; + ent->parent_de->inode = cpu_to_le32(dir_ino); BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata"); if (!ent->dir_inlined) { @@ -3691,7 +3646,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent, if (ext4_has_feature_filetype(ent->dir->i_sb)) ent->de->file_type = file_type; inode_inc_iversion(ent->dir); - ent->dir->i_mtime = inode_set_ctime_current(ent->dir); + inode_set_mtime_to_ts(ent->dir, inode_set_ctime_current(ent->dir)); retval = ext4_mark_inode_dirty(handle, ent->dir); BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); if (!ent->inlined) { @@ -3944,7 +3899,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir)) goto end_rename; } - retval = ext4_rename_dir_prepare(handle, &old); + retval = ext4_rename_dir_prepare(handle, &old, new.dir != old.dir); if (retval) goto end_rename; } @@ -4006,9 +3961,9 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, ext4_dec_count(new.inode); inode_set_ctime_current(new.inode); } - old.dir->i_mtime = inode_set_ctime_current(old.dir); + inode_set_mtime_to_ts(old.dir, inode_set_ctime_current(old.dir)); ext4_update_dx_flag(old.dir); - if (old.dir_bh) { + if (old.is_dir) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); if (retval) goto end_rename; @@ -4031,7 +3986,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (unlikely(retval)) goto end_rename; - if (S_ISDIR(old.inode->i_mode)) { + if (old.is_dir) { /* * We disable fast commits here that's because the * replay code is not yet capable of changing dot dot @@ -4158,14 +4113,12 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, ext4_handle_sync(handle); if (S_ISDIR(old.inode->i_mode)) { - old.is_dir = true; - retval = ext4_rename_dir_prepare(handle, &old); + retval = ext4_rename_dir_prepare(handle, &old, new.dir != old.dir); if (retval) goto end_rename; } if (S_ISDIR(new.inode->i_mode)) { - new.is_dir = true; - retval = ext4_rename_dir_prepare(handle, &new); + retval = ext4_rename_dir_prepare(handle, &new, new.dir != old.dir); if (retval) goto end_rename; } diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index e5b47dda3317..a23b0c01f809 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -590,8 +590,9 @@ int ext4_init_orphan_info(struct super_block *sb) } oi->of_blocks = inode->i_size >> sb->s_blocksize_bits; oi->of_csum_seed = EXT4_I(inode)->i_csum_seed; - oi->of_binfo = kmalloc(oi->of_blocks*sizeof(struct ext4_orphan_block), - GFP_KERNEL); + oi->of_binfo = kmalloc_array(oi->of_blocks, + sizeof(struct ext4_orphan_block), + GFP_KERNEL); if (!oi->of_binfo) { ret = -ENOMEM; goto out_put; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 7ab4f5a9bf5b..cb023922c93c 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -181,14 +181,25 @@ static int ext4_end_io_end(ext4_io_end_t *io_end) "list->prev 0x%p\n", io_end, inode->i_ino, io_end->list.next, io_end->list.prev); - io_end->handle = NULL; /* Following call will use up the handle */ - ret = ext4_convert_unwritten_io_end_vec(handle, io_end); + /* + * Do not convert the unwritten extents if data writeback fails, + * or stale data may be exposed. + */ + io_end->handle = NULL; /* Following call will use up the handle */ + if (unlikely(io_end->flag & EXT4_IO_END_FAILED)) { + ret = -EIO; + if (handle) + jbd2_journal_free_reserved(handle); + } else { + ret = ext4_convert_unwritten_io_end_vec(handle, io_end); + } if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) { ext4_msg(inode->i_sb, KERN_EMERG, "failed to convert unwritten extents to written " "extents -- potential data loss! " "(inode %lu, error %d)", inode->i_ino, ret); } + ext4_clear_io_unwritten_flag(io_end); ext4_release_io_end(io_end); return ret; @@ -344,6 +355,7 @@ static void ext4_end_bio(struct bio *bio) bio->bi_status, inode->i_ino, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); + io_end->flag |= EXT4_IO_END_FAILED; mapping_set_error(inode->i_mapping, blk_status_to_errno(bio->bi_status)); } @@ -421,7 +433,7 @@ submit_and_retry: io_submit_init_bio(io, bh); if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh))) goto submit_and_retry; - wbc_account_cgroup_owner(io->io_wbc, &folio->page, bh->b_size); + wbc_account_cgroup_owner(io->io_wbc, folio, bh->b_size); io->io_next_block++; } @@ -441,7 +453,7 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, BUG_ON(folio_test_writeback(folio)); /* - * Comments copied from block_write_full_page: + * Comments copied from block_write_full_folio: * * The folio straddles i_size. It must be zeroed out on each and every * writepage invocation because it may be mmapped. "A file is mapped diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 8cb83e7b699b..5d3a9dc9a32d 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -70,15 +70,8 @@ static void __read_end_io(struct bio *bio) { struct folio_iter fi; - bio_for_each_folio_all(fi, bio) { - struct folio *folio = fi.folio; - - if (bio->bi_status) - folio_clear_uptodate(folio); - else - folio_mark_uptodate(folio); - folio_unlock(folio); - } + bio_for_each_folio_all(fi, bio) + folio_end_read(fi.folio, bio->bi_status == 0); if (bio->bi_private) mempool_free(bio->bi_private, bio_post_read_ctx_pool); bio_put(bio); @@ -228,7 +221,7 @@ int ext4_mpage_readpages(struct inode *inode, sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; - sector_t blocks[MAX_BUF_PER_PAGE]; + sector_t first_block; unsigned page_block; struct block_device *bdev = inode->i_sb->s_bdev; int length; @@ -270,6 +263,7 @@ int ext4_mpage_readpages(struct inode *inode, unsigned map_offset = block_in_file - map.m_lblk; unsigned last = map.m_len - map_offset; + first_block = map.m_pblk + map_offset; for (relative_block = 0; ; relative_block++) { if (relative_block == last) { /* needed? */ @@ -278,8 +272,6 @@ int ext4_mpage_readpages(struct inode *inode, } if (page_block == blocks_per_page) break; - blocks[page_block] = map.m_pblk + map_offset + - relative_block; page_block++; block_in_file++; } @@ -314,7 +306,9 @@ int ext4_mpage_readpages(struct inode *inode, goto confused; /* hole -> non-hole */ /* Contiguous blocks? */ - if (page_block && blocks[page_block-1] != map.m_pblk-1) + if (!page_block) + first_block = map.m_pblk; + else if (first_block + page_block != map.m_pblk) goto confused; for (relative_block = 0; ; relative_block++) { if (relative_block == map.m_len) { @@ -323,7 +317,6 @@ int ext4_mpage_readpages(struct inode *inode, break; } else if (page_block == blocks_per_page) break; - blocks[page_block] = map.m_pblk+relative_block; page_block++; block_in_file++; } @@ -335,8 +328,7 @@ int ext4_mpage_readpages(struct inode *inode, if (ext4_need_verity(inode, folio->index) && !fsverity_verify_folio(folio)) goto set_error_page; - folio_mark_uptodate(folio); - folio_unlock(folio); + folio_end_read(folio, true); continue; } } else if (fully_mapped) { @@ -347,7 +339,7 @@ int ext4_mpage_readpages(struct inode *inode, * This folio will go to BIO. Do we need to send this * BIO off first? */ - if (bio && (last_block_in_bio != blocks[0] - 1 || + if (bio && (last_block_in_bio != first_block - 1 || !fscrypt_mergeable_bio(bio, inode, next_block))) { submit_and_realloc: submit_bio(bio); @@ -363,7 +355,7 @@ int ext4_mpage_readpages(struct inode *inode, fscrypt_set_bio_crypt_ctx(bio, inode, next_block, GFP_KERNEL); ext4_set_bio_post_read_ctx(bio, inode, folio->index); - bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); + bio->bi_iter.bi_sector = first_block << (blkbits - 9); bio->bi_end_io = mpage_end_io; if (rac) bio->bi_opf |= REQ_RAHEAD; @@ -379,7 +371,7 @@ int ext4_mpage_readpages(struct inode *inode, submit_bio(bio); bio = NULL; } else - last_block_in_bio = blocks[blocks_per_page - 1]; + last_block_in_bio = first_block + blocks_per_page - 1; continue; confused: if (bio) { diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index b34007541e08..72f77f78ae8d 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -10,8 +10,6 @@ */ -#define EXT4FS_DEBUG - #include <linux/errno.h> #include <linux/slab.h> #include <linux/jiffies.h> @@ -57,7 +55,7 @@ int ext4_resize_begin(struct super_block *sb) * If the reserved GDT blocks is non-zero, the resize_inode feature * should always be set. */ - if (EXT4_SB(sb)->s_es->s_reserved_gdt_blocks && + if (sbi->s_es->s_reserved_gdt_blocks && !ext4_has_feature_resize_inode(sb)) { ext4_error(sb, "resize_inode disabled but reserved GDT blocks non-zero"); return -EFSCORRUPTED; @@ -69,9 +67,9 @@ int ext4_resize_begin(struct super_block *sb) * bad time to do it anyways. */ if (EXT4_B2C(sbi, sbi->s_sbh->b_blocknr) != - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { + le32_to_cpu(sbi->s_es->s_first_data_block)) { ext4_warning(sb, "won't resize using backup superblock at %llu", - (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); + (unsigned long long)sbi->s_sbh->b_blocknr); return -EPERM; } @@ -79,7 +77,7 @@ int ext4_resize_begin(struct super_block *sb) * We are not allowed to do online-resizing on a filesystem mounted * with error, because it can destroy the filesystem easily. */ - if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + if (sbi->s_mount_state & EXT4_ERROR_FS) { ext4_warning(sb, "There are errors in the filesystem, " "so online resizing is not allowed"); return -EPERM; @@ -91,7 +89,7 @@ int ext4_resize_begin(struct super_block *sb) } if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING, - &EXT4_SB(sb)->s_ext4_flags)) + &sbi->s_ext4_flags)) ret = -EBUSY; return ret; @@ -106,18 +104,6 @@ int ext4_resize_end(struct super_block *sb, bool update_backups) return 0; } -static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb, - ext4_group_t group) { - return (group >> EXT4_DESC_PER_BLOCK_BITS(sb)) << - EXT4_DESC_PER_BLOCK_BITS(sb); -} - -static ext4_fsblk_t ext4_meta_bg_first_block_no(struct super_block *sb, - ext4_group_t group) { - group = ext4_meta_bg_first_group(sb, group); - return ext4_group_first_block_no(sb, group); -} - static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb, ext4_group_t group) { ext4_grpblk_t overhead; @@ -154,8 +140,9 @@ static int verify_group_input(struct super_block *sb, overhead = ext4_group_overhead_blocks(sb, group); metaend = start + overhead; - input->free_clusters_count = free_blocks_count = - input->blocks_count - 2 - overhead - sbi->s_itb_per_group; + free_blocks_count = input->blocks_count - 2 - overhead - + sbi->s_itb_per_group; + input->free_clusters_count = EXT4_B2C(sbi, free_blocks_count); if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks " @@ -243,23 +230,35 @@ struct ext4_new_flex_group_data { #define MAX_RESIZE_BG 16384 /* - * alloc_flex_gd() allocates a ext4_new_flex_group_data with size of - * @flexbg_size. + * alloc_flex_gd() allocates an ext4_new_flex_group_data that satisfies the + * resizing from @o_group to @n_group, its size is typically @flexbg_size. * * Returns NULL on failure otherwise address of the allocated structure. */ -static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned int flexbg_size) +static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned int flexbg_size, + ext4_group_t o_group, ext4_group_t n_group) { + ext4_group_t last_group; + unsigned int max_resize_bg; struct ext4_new_flex_group_data *flex_gd; flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS); if (flex_gd == NULL) goto out3; - if (unlikely(flexbg_size > MAX_RESIZE_BG)) - flex_gd->resize_bg = MAX_RESIZE_BG; - else - flex_gd->resize_bg = flexbg_size; + max_resize_bg = umin(flexbg_size, MAX_RESIZE_BG); + flex_gd->resize_bg = max_resize_bg; + + /* Avoid allocating large 'groups' array if not needed */ + last_group = o_group | (flex_gd->resize_bg - 1); + if (n_group <= last_group) + flex_gd->resize_bg = 1 << fls(n_group - o_group); + else if (n_group - last_group < flex_gd->resize_bg) + flex_gd->resize_bg = 1 << max(fls(last_group - o_group), + fls(n_group - last_group)); + + if (WARN_ON_ONCE(flex_gd->resize_bg > max_resize_bg)) + flex_gd->resize_bg = max_resize_bg; flex_gd->groups = kmalloc_array(flex_gd->resize_bg, sizeof(struct ext4_new_group_data), @@ -468,8 +467,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, ext4_debug("mark clusters [%llu-%llu] used\n", first_cluster, last_cluster); - for (count2 = count; count > 0; - count -= count2, first_cluster += count2) { + for (; count > 0; count -= count2, first_cluster += count2) { ext4_fsblk_t start; struct buffer_head *bh; ext4_group_t group; @@ -617,7 +615,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb, } handle_itb: - /* Initialize group tables of the grop @group */ + /* Initialize group tables of the group @group */ if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED)) goto handle_bb; @@ -707,16 +705,14 @@ handle_ib: block = start; } - if (count) { - err = set_flexbg_block_bitmap(sb, handle, - flex_gd, - EXT4_B2C(sbi, start), - EXT4_B2C(sbi, - start + count - - 1)); - if (err) - goto out; - } + err = set_flexbg_block_bitmap(sb, handle, + flex_gd, + EXT4_B2C(sbi, start), + EXT4_B2C(sbi, + start + count + - 1)); + if (err) + goto out; } out: @@ -955,7 +951,13 @@ errout: } /* - * add_new_gdb_meta_bg is the sister of add_new_gdb. + * If there is no available space in the existing block group descriptors for + * the new block group and there are no reserved block group descriptors, then + * the meta_bg feature will get enabled, and es->s_first_meta_bg will get set + * to the first block group that is managed using meta_bg and s_first_meta_bg + * must be a multiple of EXT4_DESC_PER_BLOCK(sb). + * This function will be called when first group of meta_bg is added to bring + * new group descriptors block of new added meta_bg. */ static int add_new_gdb_meta_bg(struct super_block *sb, handle_t *handle, ext4_group_t group) { @@ -965,8 +967,8 @@ static int add_new_gdb_meta_bg(struct super_block *sb, unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); int err; - gdblock = ext4_meta_bg_first_block_no(sb, group) + - ext4_bg_has_super(sb, group); + gdblock = ext4_group_first_block_no(sb, group) + + ext4_bg_has_super(sb, group); gdb_bh = ext4_sb_bread(sb, gdblock, 0); if (IS_ERR(gdb_bh)) return PTR_ERR(gdb_bh); @@ -1090,9 +1092,6 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, for (i = 0; i < reserved_gdb; i++) { int err2; data = (__le32 *)primary[i]->b_data; - /* printk("reserving backup %lu[%u] = %lu\n", - primary[i]->b_blocknr, gdbackups, - blk + primary[i]->b_blocknr); */ data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]); if (!err) @@ -1322,8 +1321,7 @@ static int ext4_set_bitmap_checksums(struct super_block *sb, bh = ext4_get_bitmap(sb, group_data->inode_bitmap); if (!bh) return -EIO; - ext4_inode_bitmap_csum_set(sb, gdp, bh, - EXT4_INODES_PER_GROUP(sb) / 8); + ext4_inode_bitmap_csum_set(sb, gdp, bh); brelse(bh); bh = ext4_get_bitmap(sb, group_data->block_bitmap); @@ -1609,7 +1607,6 @@ exit_journal: gdb_num >= le32_to_cpu(es->s_first_meta_bg); sector_t padding_blocks = meta_bg ? 0 : sbi->s_sbh->b_blocknr - ext4_group_first_block_no(sb, 0); - sector_t old_gdb = 0; update_backups(sb, ext4_group_first_block_no(sb, 0), (char *)es, sizeof(struct ext4_super_block), 0); @@ -1618,11 +1615,8 @@ exit_journal: gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, gdb_num); - if (old_gdb == gdb_bh->b_blocknr) - continue; update_backups(sb, gdb_bh->b_blocknr - padding_blocks, gdb_bh->b_data, gdb_bh->b_size, meta_bg); - old_gdb = gdb_bh->b_blocknr; } } exit: @@ -2092,7 +2086,7 @@ retry: } } - if ((!resize_inode && !meta_bg) || n_blocks_count == o_blocks_count) { + if ((!resize_inode && !meta_bg && n_desc_blocks > o_desc_blocks) || n_blocks_count == o_blocks_count) { err = ext4_convert_meta_bg(sb, resize_inode); if (err) goto out; @@ -2149,7 +2143,7 @@ retry: if (err) goto out; - flex_gd = alloc_flex_gd(flexbg_size); + flex_gd = alloc_flex_gd(flexbg_size, o_group, n_group); if (flex_gd == NULL) { err = -ENOMEM; goto out; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2346ef071b24..722ac723f49b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -251,18 +251,25 @@ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb, struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, blk_opf_t op_flags) { - return __ext4_sb_bread_gfp(sb, block, op_flags, __GFP_MOVABLE); + gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, + ~__GFP_FS) | __GFP_MOVABLE; + + return __ext4_sb_bread_gfp(sb, block, op_flags, gfp); } struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block) { - return __ext4_sb_bread_gfp(sb, block, 0, 0); + gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, + ~__GFP_FS); + + return __ext4_sb_bread_gfp(sb, block, 0, gfp); } void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) { - struct buffer_head *bh = sb_getblk_gfp(sb, block, 0); + struct buffer_head *bh = bdev_getblk(sb->s_bdev, block, + sb->s_blocksize, GFP_NOWAIT | __GFP_NOWARN); if (likely(bh)) { if (trylock_buffer(bh)) @@ -492,22 +499,6 @@ static void ext4_maybe_update_superblock(struct super_block *sb) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } -/* - * The del_gendisk() function uninitializes the disk-specific data - * structures, including the bdi structure, without telling anyone - * else. Once this happens, any attempt to call mark_buffer_dirty() - * (for example, by ext4_commit_super), will cause a kernel OOPS. - * This is a kludge to prevent these oops until we can put in a proper - * hook in del_gendisk() to inform the VFS and file system layers. - */ -static int block_device_ejected(struct super_block *sb) -{ - struct inode *bd_inode = sb->s_bdev->bd_inode; - struct backing_dev_info *bdi = inode_to_bdi(bd_inode); - - return bdi->dev == NULL; -} - static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) { struct super_block *sb = journal->j_private; @@ -1344,6 +1335,9 @@ static void ext4_put_super(struct super_block *sb) ext4_group_desc_free(sbi); ext4_flex_groups_free(sbi); + + WARN_ON_ONCE(!(sbi->s_mount_state & EXT4_ERROR_FS) && + percpu_counter_sum(&sbi->s_dirtyclusters_counter)); ext4_percpu_param_destroy(sbi); #ifdef CONFIG_QUOTA for (int i = 0; i < EXT4_MAXQUOTAS; i++) @@ -1360,14 +1354,14 @@ static void ext4_put_super(struct super_block *sb) sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); - if (sbi->s_journal_bdev) { + if (sbi->s_journal_bdev_file) { /* * Invalidate the journal device's buffers. We don't want them * floating about in memory - the physical journal device may * hotswapped, and it breaks the `ro-after' testing code. */ - sync_blockdev(sbi->s_journal_bdev); - invalidate_bdev(sbi->s_journal_bdev); + sync_blockdev(file_bdev(sbi->s_journal_bdev_file)); + invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); } ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); @@ -1474,7 +1468,8 @@ static void ext4_destroy_inode(struct inode *inode) dump_stack(); } - if (EXT4_I(inode)->i_reserved_data_blocks) + if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ERROR_FS) && + WARN_ON_ONCE(EXT4_I(inode)->i_reserved_data_blocks)) ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!", inode->i_ino, EXT4_I(inode), @@ -1501,8 +1496,7 @@ static int __init init_inodecache(void) { ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache", sizeof(struct ext4_inode_info), 0, - (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| - SLAB_ACCOUNT), + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, offsetof(struct ext4_inode_info, i_data), sizeof_field(struct ext4_inode_info, i_data), init_once); @@ -1526,7 +1520,7 @@ void ext4_clear_inode(struct inode *inode) ext4_fc_del(inode); invalidate_inode_buffers(inode); clear_inode(inode); - ext4_discard_preallocations(inode, 0); + ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); dquot_drop(inode); if (EXT4_I(inode)->jinode) { @@ -1655,6 +1649,7 @@ static const struct super_operations ext4_sops = { }; static const struct export_operations ext4_export_ops = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = ext4_fh_to_dentry, .fh_to_parent = ext4_fh_to_parent, .get_parent = ext4_get_parent, @@ -1724,10 +1719,6 @@ static const struct constant_table ext4_param_dax[] = { {} }; -/* String parameter that allows empty argument */ -#define fsparam_string_empty(NAME, OPT) \ - __fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL) - /* * Mount option specification * We don't use fsparam_flag_no because of the way we set the @@ -1742,8 +1733,8 @@ static const struct fs_parameter_spec ext4_param_specs[] = { fsparam_flag ("bsdgroups", Opt_grpid), fsparam_flag ("nogrpid", Opt_nogrpid), fsparam_flag ("sysvgroups", Opt_nogrpid), - fsparam_u32 ("resgid", Opt_resgid), - fsparam_u32 ("resuid", Opt_resuid), + fsparam_gid ("resgid", Opt_resgid), + fsparam_uid ("resuid", Opt_resuid), fsparam_u32 ("sb", Opt_sb), fsparam_enum ("errors", Opt_errors, ext4_param_errors), fsparam_flag ("nouid32", Opt_nouid32), @@ -2028,6 +2019,9 @@ int ext4_init_fs_context(struct fs_context *fc) fc->fs_private = ctx; fc->ops = &ext4_context_ops; + /* i_version is always enabled now */ + fc->sb_flags |= SB_I_VERSION; + return 0; } @@ -2079,8 +2073,7 @@ static int unnote_qf_name(struct fs_context *fc, int qtype) { struct ext4_fs_context *ctx = fc->fs_private; - if (ctx->s_qf_names[qtype]) - kfree(ctx->s_qf_names[qtype]); + kfree(ctx->s_qf_names[qtype]); ctx->s_qf_names[qtype] = NULL; ctx->qname_spec |= 1 << qtype; @@ -2149,8 +2142,6 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) struct fs_parse_result result; const struct mount_opts *m; int is_remount; - kuid_t uid; - kgid_t gid; int token; token = fs_parse(fc, ext4_param_specs, param, &result); @@ -2292,23 +2283,11 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) ctx->spec |= EXT4_SPEC_s_stripe; return 0; case Opt_resuid: - uid = make_kuid(current_user_ns(), result.uint_32); - if (!uid_valid(uid)) { - ext4_msg(NULL, KERN_ERR, "Invalid uid value %d", - result.uint_32); - return -EINVAL; - } - ctx->s_resuid = uid; + ctx->s_resuid = result.uid; ctx->spec |= EXT4_SPEC_s_resuid; return 0; case Opt_resgid: - gid = make_kgid(current_user_ns(), result.uint_32); - if (!gid_valid(gid)) { - ext4_msg(NULL, KERN_ERR, "Invalid gid value %d", - result.uint_32); - return -EINVAL; - } - ctx->s_resgid = gid; + ctx->s_resgid = result.gid; ctx->spec |= EXT4_SPEC_s_resgid; return 0; case Opt_journal_dev: @@ -2485,8 +2464,7 @@ static int parse_options(struct fs_context *fc, char *options) param.size = v_len; ret = ext4_parse_param(fc, ¶m); - if (param.string) - kfree(param.string); + kfree(param.string); if (ret < 0) return ret; } @@ -2793,15 +2771,6 @@ static int ext4_check_opt_consistency(struct fs_context *fc, return -EINVAL; } - if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DIOREAD_NOLOCK)) { - int blocksize = - BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); - if (blocksize < PAGE_SIZE) - ext4_msg(NULL, KERN_WARNING, "Warning: mounting with an " - "experimental mount option 'dioread_nolock' " - "for blocksize < PAGE_SIZE"); - } - err = ext4_check_test_dummy_encryption(fc, sb); if (err) return err; @@ -2821,6 +2790,13 @@ static int ext4_check_opt_consistency(struct fs_context *fc, } if (is_remount) { + if (!sbi->s_journal && + ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT)) { + ext4_msg(NULL, KERN_WARNING, + "Remounting fs w/o journal so ignoring data_err option"); + ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT); + } + if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { ext4_msg(NULL, KERN_ERR, "can't mount with " @@ -3087,7 +3063,7 @@ int ext4_seq_options_show(struct seq_file *seq, void *offset) seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw"); rc = _ext4_show_options(seq, sb, 1); - seq_puts(seq, "\n"); + seq_putc(seq, '\n'); return rc; } @@ -3618,14 +3594,12 @@ int ext4_feature_set_ok(struct super_block *sb, int readonly) return 0; } -#if !IS_ENABLED(CONFIG_UNICODE) - if (ext4_has_feature_casefold(sb)) { + if (!IS_ENABLED(CONFIG_UNICODE) && ext4_has_feature_casefold(sb)) { ext4_msg(sb, KERN_ERR, "Filesystem with casefold feature cannot be " "mounted without CONFIG_UNICODE"); return 0; } -#endif if (readonly) return 1; @@ -4242,7 +4216,7 @@ int ext4_calculate_overhead(struct super_block *sb) * Add the internal journal blocks whether the journal has been * loaded or not */ - if (sbi->s_journal && !sbi->s_journal_bdev) + if (sbi->s_journal && !sbi->s_journal_bdev_file) overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len); else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ @@ -4410,7 +4384,7 @@ static void ext4_set_def_opts(struct super_block *sb, ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sb, DELALLOC); - if (sb->s_blocksize == PAGE_SIZE) + if (sb->s_blocksize <= PAGE_SIZE) set_opt(sb, DIOREAD_NOLOCK); } @@ -4431,22 +4405,6 @@ static int ext4_handle_clustersize(struct super_block *sb) } sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - le32_to_cpu(es->s_log_block_size); - sbi->s_clusters_per_group = - le32_to_cpu(es->s_clusters_per_group); - if (sbi->s_clusters_per_group > sb->s_blocksize * 8) { - ext4_msg(sb, KERN_ERR, - "#clusters per group too big: %lu", - sbi->s_clusters_per_group); - return -EINVAL; - } - if (sbi->s_blocks_per_group != - (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) { - ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and " - "clusters per group (%lu) inconsistent", - sbi->s_blocks_per_group, - sbi->s_clusters_per_group); - return -EINVAL; - } } else { if (clustersize != sb->s_blocksize) { ext4_msg(sb, KERN_ERR, @@ -4460,9 +4418,21 @@ static int ext4_handle_clustersize(struct super_block *sb) sbi->s_blocks_per_group); return -EINVAL; } - sbi->s_clusters_per_group = sbi->s_blocks_per_group; sbi->s_cluster_bits = 0; } + sbi->s_clusters_per_group = le32_to_cpu(es->s_clusters_per_group); + if (sbi->s_clusters_per_group > sb->s_blocksize * 8) { + ext4_msg(sb, KERN_ERR, "#clusters per group too big: %lu", + sbi->s_clusters_per_group); + return -EINVAL; + } + if (sbi->s_blocks_per_group != + (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) { + ext4_msg(sb, KERN_ERR, + "blocks per group (%lu) and clusters per group (%lu) inconsistent", + sbi->s_blocks_per_group, sbi->s_clusters_per_group); + return -EINVAL; + } sbi->s_cluster_ratio = clustersize / sb->s_blocksize; /* Do we have standard group size of clustersize * 8 blocks ? */ @@ -5135,16 +5105,27 @@ out: return ret; } -static void ext4_hash_info_init(struct super_block *sb) +static int ext4_hash_info_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; unsigned int i; + sbi->s_def_hash_version = es->s_def_hash_version; + + if (sbi->s_def_hash_version > DX_HASH_LAST) { + ext4_msg(sb, KERN_ERR, + "Invalid default hash set in the superblock"); + return -EINVAL; + } else if (sbi->s_def_hash_version == DX_HASH_SIPHASH) { + ext4_msg(sb, KERN_ERR, + "SIPHASH is not a valid default hash value"); + return -EINVAL; + } + for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); - sbi->s_def_hash_version = es->s_def_hash_version; if (ext4_has_feature_dir_index(sb)) { i = le32_to_cpu(es->s_flags); if (i & EXT2_FLAGS_UNSIGNED_HASH) @@ -5162,6 +5143,7 @@ static void ext4_hash_info_init(struct super_block *sb) #endif } } + return 0; } static int ext4_block_group_meta_init(struct super_block *sb, int silent) @@ -5298,9 +5280,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); - /* i_version is always enabled now */ - sb->s_flags |= SB_I_VERSION; - err = ext4_check_feature_compatibility(sb, es, silent); if (err) goto failed_mount; @@ -5309,7 +5288,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (err) goto failed_mount; - ext4_hash_info_init(sb); + err = ext4_hash_info_init(sb); + if (err) + goto failed_mount; err = ext4_handle_clustersize(sb); if (err) @@ -5361,11 +5342,14 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sb->s_qcop = &ext4_qctl_operations; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif - memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); + super_set_uuid(sb, es->s_uuid, sizeof(es->s_uuid)); + super_set_sysfs_name_bdev(sb); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); + spin_lock_init(&sbi->s_bdev_wb_lock); + ext4_fast_commit_init(sb); sb->s_root = NULL; @@ -5389,6 +5373,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) err = ext4_load_and_init_journal(sb, es, ctx); if (err) goto failed_mount3a; + if (bdev_read_only(sb->s_bdev)) + needs_recovery = 0; } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) && ext4_has_feature_journal_needs_recovery(sb)) { ext4_msg(sb, KERN_ERR, "required journal recovery " @@ -5419,6 +5405,11 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) "data=, fs mounted w/o journal"); goto failed_mount3a; } + if (test_opt(sb, DATA_ERR_ABORT)) { + ext4_msg(sb, KERN_ERR, + "can't mount with data_err=abort, fs mounted w/o journal"); + goto failed_mount3a; + } sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM; clear_opt(sb, JOURNAL_CHECKSUM); clear_opt(sb, DATA_FLAGS); @@ -5499,6 +5490,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) goto failed_mount4; } + generic_set_sb_d_ops(sb); sb->s_root = d_make_root(root); if (!sb->s_root) { ext4_msg(sb, KERN_ERR, "get root dentry failed"); @@ -5586,8 +5578,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) * Save the original bdev mapping's wb_err value which could be * used to detect the metadata async write error. */ - spin_lock_init(&sbi->s_bdev_wb_lock); - errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err, + errseq_check_and_advance(&sb->s_bdev->bd_mapping->wb_err, &sbi->s_bdev_wb_err); EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); @@ -5683,9 +5674,9 @@ failed_mount: #endif fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); brelse(sbi->s_sbh); - if (sbi->s_journal_bdev) { - invalidate_bdev(sbi->s_journal_bdev); - blkdev_put(sbi->s_journal_bdev, sb); + if (sbi->s_journal_bdev_file) { + invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); + bdev_fput(sbi->s_journal_bdev_file); } out_fail: invalidate_bdev(sb->s_bdev); @@ -5855,30 +5846,30 @@ static journal_t *ext4_open_inode_journal(struct super_block *sb, return journal; } -static struct block_device *ext4_get_journal_blkdev(struct super_block *sb, +static struct file *ext4_get_journal_blkdev(struct super_block *sb, dev_t j_dev, ext4_fsblk_t *j_start, ext4_fsblk_t *j_len) { struct buffer_head *bh; struct block_device *bdev; + struct file *bdev_file; int hblock, blocksize; ext4_fsblk_t sb_block; unsigned long offset; struct ext4_super_block *es; int errno; - /* see get_tree_bdev why this is needed and safe */ - up_write(&sb->s_umount); - bdev = blkdev_get_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb, - &fs_holder_ops); - down_write(&sb->s_umount); - if (IS_ERR(bdev)) { + bdev_file = bdev_file_open_by_dev(j_dev, + BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES, + sb, &fs_holder_ops); + if (IS_ERR(bdev_file)) { ext4_msg(sb, KERN_ERR, "failed to open journal device unknown-block(%u,%u) %ld", - MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev)); - return ERR_CAST(bdev); + MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_file)); + return bdev_file; } + bdev = file_bdev(bdev_file); blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { @@ -5890,7 +5881,7 @@ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb, sb_block = EXT4_MIN_BLOCK_SIZE / blocksize; offset = EXT4_MIN_BLOCK_SIZE % blocksize; - set_blocksize(bdev, blocksize); + set_blocksize(bdev_file, blocksize); bh = __bread(bdev, sb_block, blocksize); if (!bh) { ext4_msg(sb, KERN_ERR, "couldn't read superblock of " @@ -5925,12 +5916,12 @@ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb, *j_start = sb_block + 1; *j_len = ext4_blocks_count(es); brelse(bh); - return bdev; + return bdev_file; out_bh: brelse(bh); out_bdev: - blkdev_put(bdev, sb); + bdev_fput(bdev_file); return ERR_PTR(errno); } @@ -5940,14 +5931,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb, journal_t *journal; ext4_fsblk_t j_start; ext4_fsblk_t j_len; - struct block_device *journal_bdev; + struct file *bdev_file; int errno = 0; - journal_bdev = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); - if (IS_ERR(journal_bdev)) - return ERR_CAST(journal_bdev); + bdev_file = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); + if (IS_ERR(bdev_file)) + return ERR_CAST(bdev_file); - journal = jbd2_journal_init_dev(journal_bdev, sb->s_bdev, j_start, + journal = jbd2_journal_init_dev(file_bdev(bdev_file), sb->s_bdev, j_start, j_len, sb->s_blocksize); if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "failed to create device journal"); @@ -5962,14 +5953,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb, goto out_journal; } journal->j_private = sb; - EXT4_SB(sb)->s_journal_bdev = journal_bdev; + EXT4_SB(sb)->s_journal_bdev_file = bdev_file; ext4_init_journal_params(sb, journal); return journal; out_journal: jbd2_journal_destroy(journal); out_bdev: - blkdev_put(journal_bdev, sb); + bdev_fput(bdev_file); return ERR_PTR(errno); } @@ -6147,8 +6138,8 @@ static void ext4_update_super(struct super_block *sb) __ext4_update_tstamp(&es->s_first_error_time, &es->s_first_error_time_hi, sbi->s_first_error_time); - strncpy(es->s_first_error_func, sbi->s_first_error_func, - sizeof(es->s_first_error_func)); + strtomem_pad(es->s_first_error_func, + sbi->s_first_error_func, 0); es->s_first_error_line = cpu_to_le32(sbi->s_first_error_line); es->s_first_error_ino = @@ -6161,8 +6152,7 @@ static void ext4_update_super(struct super_block *sb) __ext4_update_tstamp(&es->s_last_error_time, &es->s_last_error_time_hi, sbi->s_last_error_time); - strncpy(es->s_last_error_func, sbi->s_last_error_func, - sizeof(es->s_last_error_func)); + strtomem_pad(es->s_last_error_func, sbi->s_last_error_func, 0); es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line); es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino); es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block); @@ -6189,8 +6179,6 @@ static int ext4_commit_super(struct super_block *sb) if (!sbh) return -EINVAL; - if (block_device_ejected(sb)) - return -ENODEV; ext4_update_super(sb); @@ -6770,6 +6758,7 @@ static int ext4_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; int ret; + bool old_ro = sb_rdonly(sb); fc->s_fs_info = EXT4_SB(sb); @@ -6781,9 +6770,9 @@ static int ext4_reconfigure(struct fs_context *fc) if (ret < 0) return ret; - ext4_msg(sb, KERN_INFO, "re-mounted %pU %s. Quota mode: %s.", - &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w", - ext4_quota_mode(sb)); + ext4_msg(sb, KERN_INFO, "re-mounted %pU%s.", + &sb->s_uuid, + (old_ro != sb_rdonly(sb)) ? (sb_rdonly(sb) ? " ro" : " r/w") : ""); return 0; } @@ -6807,22 +6796,29 @@ static int ext4_statfs_project(struct super_block *sb, dquot->dq_dqb.dqb_bhardlimit); limit >>= sb->s_blocksize_bits; - if (limit && buf->f_blocks > limit) { + if (limit) { + uint64_t remaining = 0; + curblock = (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; - buf->f_blocks = limit; - buf->f_bfree = buf->f_bavail = - (buf->f_blocks > curblock) ? - (buf->f_blocks - curblock) : 0; + if (limit > curblock) + remaining = limit - curblock; + + buf->f_blocks = min(buf->f_blocks, limit); + buf->f_bfree = min(buf->f_bfree, remaining); + buf->f_bavail = min(buf->f_bavail, remaining); } limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, dquot->dq_dqb.dqb_ihardlimit); - if (limit && buf->f_files > limit) { - buf->f_files = limit; - buf->f_ffree = - (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? - (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; + if (limit) { + uint64_t remaining = 0; + + if (limit > dquot->dq_dqb.dqb_curinodes) + remaining = limit - dquot->dq_dqb.dqb_curinodes; + + buf->f_files = min(buf->f_files, limit); + buf->f_ffree = min(buf->f_ffree, remaining); } spin_unlock(&dquot->dq_dqb_lock); @@ -6925,12 +6921,25 @@ static int ext4_release_dquot(struct dquot *dquot) { int ret, err; handle_t *handle; + bool freeze_protected = false; + + /* + * Trying to sb_start_intwrite() in a running transaction + * can result in a deadlock. Further, running transactions + * are already protected from freezing. + */ + if (!ext4_journal_current_handle()) { + sb_start_intwrite(dquot->dq_sb); + freeze_protected = true; + } handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) { /* Release dquot anyway to avoid endless cycle in dqput() */ dquot_release(dquot); + if (freeze_protected) + sb_end_intwrite(dquot->dq_sb); return PTR_ERR(handle); } ret = dquot_release(dquot); @@ -6941,6 +6950,10 @@ static int ext4_release_dquot(struct dquot *dquot) err = ext4_journal_stop(handle); if (!ret) ret = err; + + if (freeze_protected) + sb_end_intwrite(dquot->dq_sb); + return ret; } @@ -7180,7 +7193,7 @@ static int ext4_quota_off(struct super_block *sb, int type) } EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL); inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); out_unlock: @@ -7353,12 +7366,12 @@ static inline int ext3_feature_set_ok(struct super_block *sb) static void ext4_kill_sb(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); - struct block_device *journal_bdev = sbi ? sbi->s_journal_bdev : NULL; + struct file *bdev_file = sbi ? sbi->s_journal_bdev_file : NULL; kill_block_super(sb); - if (journal_bdev) - blkdev_put(journal_bdev, sb); + if (bdev_file) + bdev_fput(bdev_file); } static struct file_system_type ext4_fs_type = { diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 75bf1f88843c..645240cc0229 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -92,10 +92,12 @@ static const char *ext4_get_link(struct dentry *dentry, struct inode *inode, if (!dentry) { bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT); - if (IS_ERR(bh)) - return ERR_CAST(bh); - if (!bh || !ext4_buffer_uptodate(bh)) + if (IS_ERR(bh) || !bh) return ERR_PTR(-ECHILD); + if (!ext4_buffer_uptodate(bh)) { + brelse(bh); + return ERR_PTR(-ECHILD); + } } else { bh = ext4_bread(NULL, inode, 0, 0); if (IS_ERR(bh)) diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index d65dccb44ed5..ddb54608ca2e 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -30,7 +30,9 @@ typedef enum { attr_first_error_time, attr_last_error_time, attr_clusters_in_group, + attr_mb_order, attr_feature, + attr_pointer_pi, attr_pointer_ui, attr_pointer_ul, attr_pointer_u64, @@ -179,6 +181,9 @@ static struct ext4_attr ext4_attr_##_name = { \ #define EXT4_RO_ATTR_ES_STRING(_name,_elname,_size) \ EXT4_ATTR_STRING(_name, 0444, _size, ext4_super_block, _elname) +#define EXT4_RW_ATTR_SBI_PI(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0644, pointer_pi, ext4_sb_info, _elname) + #define EXT4_RW_ATTR_SBI_UI(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname) @@ -210,6 +215,8 @@ EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead, ext4_sb_info, s_inode_readahead_blks); EXT4_ATTR_OFFSET(mb_group_prealloc, 0644, clusters_in_group, ext4_sb_info, s_mb_group_prealloc); +EXT4_ATTR_OFFSET(mb_best_avail_max_trim_order, 0644, mb_order, + ext4_sb_info, s_mb_best_avail_max_trim_order); EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); @@ -219,13 +226,12 @@ EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups); EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); -EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); -EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst); -EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval); -EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); -EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); -EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); -EXT4_RW_ATTR_SBI_UI(mb_best_avail_max_trim_order, s_mb_best_avail_max_trim_order); +EXT4_RW_ATTR_SBI_PI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_PI(err_ratelimit_burst, s_err_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_PI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_PI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_PI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_PI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); #ifdef CONFIG_EXT4_DEBUG EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail); #endif @@ -368,13 +374,45 @@ static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi) #define print_tstamp(buf, es, tstamp) \ __print_tstamp(buf, (es)->tstamp, (es)->tstamp ## _hi) +static ssize_t ext4_generic_attr_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + void *ptr = calc_ptr(a, sbi); + + if (!ptr) + return 0; + + switch (a->attr_id) { + case attr_inode_readahead: + case attr_clusters_in_group: + case attr_mb_order: + case attr_pointer_pi: + case attr_pointer_ui: + if (a->attr_ptr == ptr_ext4_super_block_offset) + return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr)); + return sysfs_emit(buf, "%u\n", *((unsigned int *) ptr)); + case attr_pointer_ul: + return sysfs_emit(buf, "%lu\n", *((unsigned long *) ptr)); + case attr_pointer_u8: + return sysfs_emit(buf, "%u\n", *((unsigned char *) ptr)); + case attr_pointer_u64: + if (a->attr_ptr == ptr_ext4_super_block_offset) + return sysfs_emit(buf, "%llu\n", le64_to_cpup(ptr)); + return sysfs_emit(buf, "%llu\n", *((unsigned long long *) ptr)); + case attr_pointer_string: + return sysfs_emit(buf, "%.*s\n", a->attr_size, (char *) ptr); + case attr_pointer_atomic: + return sysfs_emit(buf, "%d\n", atomic_read((atomic_t *) ptr)); + } + return 0; +} + static ssize_t ext4_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, s_kobj); struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); - void *ptr = calc_ptr(a, sbi); switch (a->attr_id) { case attr_delayed_allocation_blocks: @@ -393,46 +431,6 @@ static ssize_t ext4_attr_show(struct kobject *kobj, return sysfs_emit(buf, "%llu\n", (unsigned long long) percpu_counter_sum(&sbi->s_sra_exceeded_retry_limit)); - case attr_inode_readahead: - case attr_clusters_in_group: - case attr_pointer_ui: - if (!ptr) - return 0; - if (a->attr_ptr == ptr_ext4_super_block_offset) - return sysfs_emit(buf, "%u\n", - le32_to_cpup(ptr)); - else - return sysfs_emit(buf, "%u\n", - *((unsigned int *) ptr)); - case attr_pointer_ul: - if (!ptr) - return 0; - return sysfs_emit(buf, "%lu\n", - *((unsigned long *) ptr)); - case attr_pointer_u8: - if (!ptr) - return 0; - return sysfs_emit(buf, "%u\n", - *((unsigned char *) ptr)); - case attr_pointer_u64: - if (!ptr) - return 0; - if (a->attr_ptr == ptr_ext4_super_block_offset) - return sysfs_emit(buf, "%llu\n", - le64_to_cpup(ptr)); - else - return sysfs_emit(buf, "%llu\n", - *((unsigned long long *) ptr)); - case attr_pointer_string: - if (!ptr) - return 0; - return sysfs_emit(buf, "%.*s\n", a->attr_size, - (char *) ptr); - case attr_pointer_atomic: - if (!ptr) - return 0; - return sysfs_emit(buf, "%d\n", - atomic_read((atomic_t *) ptr)); case attr_feature: return sysfs_emit(buf, "supported\n"); case attr_first_error_time: @@ -441,29 +439,33 @@ static ssize_t ext4_attr_show(struct kobject *kobj, return print_tstamp(buf, sbi->s_es, s_last_error_time); case attr_journal_task: return journal_task_show(sbi, buf); + default: + return ext4_generic_attr_show(a, sbi, buf); } - - return 0; } -static ssize_t ext4_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, size_t len) +static ssize_t ext4_generic_attr_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t len) { - struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, - s_kobj); - struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); - void *ptr = calc_ptr(a, sbi); + int ret; unsigned int t; unsigned long lt; - int ret; + void *ptr = calc_ptr(a, sbi); + + if (!ptr) + return 0; switch (a->attr_id) { - case attr_reserved_clusters: - return reserved_clusters_store(sbi, buf, len); + case attr_pointer_pi: + ret = kstrtouint(skip_spaces(buf), 0, &t); + if (ret) + return ret; + if ((int)t < 0) + return -EINVAL; + *((unsigned int *) ptr) = t; + return len; case attr_pointer_ui: - if (!ptr) - return 0; ret = kstrtouint(skip_spaces(buf), 0, &t); if (ret) return ret; @@ -472,9 +474,15 @@ static ssize_t ext4_attr_store(struct kobject *kobj, else *((unsigned int *) ptr) = t; return len; + case attr_mb_order: + ret = kstrtouint(skip_spaces(buf), 0, &t); + if (ret) + return ret; + if (t > 64) + return -EINVAL; + *((unsigned int *) ptr) = t; + return len; case attr_clusters_in_group: - if (!ptr) - return 0; ret = kstrtouint(skip_spaces(buf), 0, &t); if (ret) return ret; @@ -483,19 +491,33 @@ static ssize_t ext4_attr_store(struct kobject *kobj, *((unsigned int *) ptr) = t; return len; case attr_pointer_ul: - if (!ptr) - return 0; ret = kstrtoul(skip_spaces(buf), 0, <); if (ret) return ret; *((unsigned long *) ptr) = lt; return len; + } + return 0; +} + +static ssize_t ext4_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); + + switch (a->attr_id) { + case attr_reserved_clusters: + return reserved_clusters_store(sbi, buf, len); case attr_inode_readahead: return inode_readahead_blks_store(sbi, buf, len); case attr_trigger_test_error: return trigger_test_error(sbi, buf, len); + default: + return ext4_generic_attr_store(a, sbi, buf, len); } - return 0; } static void ext4_sb_release(struct kobject *kobj) diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index 2f37e1ea3955..d9203228ce97 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -76,17 +76,17 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, while (count) { size_t n = min_t(size_t, count, PAGE_SIZE - offset_in_page(pos)); - struct page *page; + struct folio *folio; void *fsdata = NULL; int res; - res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata); + res = aops->write_begin(NULL, mapping, pos, n, &folio, &fsdata); if (res) return res; - memcpy_to_page(page, offset_in_page(pos), buf, n); + memcpy_to_folio(folio, offset_in_folio(folio, pos), buf, n); - res = aops->write_end(NULL, mapping, pos, n, n, page, fsdata); + res = aops->write_end(NULL, mapping, pos, n, n, folio, fsdata); if (res < 0) return res; if (res != n) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index f40785bc4e55..6ff94cdf1515 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -98,7 +98,7 @@ static const struct xattr_handler * const ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_HURD] = &ext4_xattr_hurd_handler, }; -const struct xattr_handler *ext4_xattr_handlers[] = { +const struct xattr_handler * const ext4_xattr_handlers[] = { &ext4_xattr_user_handler, &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_SECURITY @@ -356,7 +356,7 @@ ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size) static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode) { - return ((u64) inode_get_ctime(ea_inode).tv_sec << 32) | + return ((u64) inode_get_ctime_sec(ea_inode) << 32) | (u32) inode_peek_iversion_raw(ea_inode); } @@ -368,12 +368,12 @@ static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count) static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode) { - return (u32)ea_inode->i_atime.tv_sec; + return (u32) inode_get_atime_sec(ea_inode); } static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash) { - ea_inode->i_atime.tv_sec = hash; + inode_set_atime(ea_inode, hash, 0); } /* @@ -418,7 +418,7 @@ free_bhs: return ret; } -#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec) +#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode_get_mtime_sec(inode))) static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, u32 ea_inode_hash, struct inode **ea_inode) @@ -1176,15 +1176,24 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, { struct inode *ea_inode; struct ext4_xattr_entry *entry; + struct ext4_iloc iloc; bool dirty = false; unsigned int ea_ino; int err; int credits; + void *end; + + if (block_csum) + end = (void *)bh->b_data + bh->b_size; + else { + ext4_get_inode_loc(parent, &iloc); + end = (void *)ext4_raw_inode(&iloc) + EXT4_SB(parent->i_sb)->s_inode_size; + } /* One credit for dec ref on ea_inode, one for orphan list addition, */ credits = 2 + extra_credits; - for (entry = first; !IS_LAST_ENTRY(entry); + for (entry = first; (void *)entry < end && !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; @@ -2034,8 +2043,13 @@ clone_block: inserted: if (!IS_LAST_ENTRY(s->first)) { - new_bh = ext4_xattr_block_cache_find(inode, header(s->base), - &ce); + new_bh = ext4_xattr_block_cache_find(inode, header(s->base), &ce); + if (IS_ERR(new_bh)) { + error = PTR_ERR(new_bh); + new_bh = NULL; + goto cleanup; + } + if (new_bh) { /* We found an identical block in the cache. */ if (new_bh == bs->bh) @@ -2875,33 +2889,31 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, if (*ea_inode_array == NULL) { /* * Start with 15 inodes, so it fits into a power-of-two size. - * If *ea_inode_array is NULL, this is essentially offsetof() */ - (*ea_inode_array) = - kmalloc(offsetof(struct ext4_xattr_inode_array, - inodes[EIA_MASK]), - GFP_NOFS); + (*ea_inode_array) = kmalloc( + struct_size(*ea_inode_array, inodes, EIA_MASK), + GFP_NOFS); if (*ea_inode_array == NULL) return -ENOMEM; (*ea_inode_array)->count = 0; } else if (((*ea_inode_array)->count & EIA_MASK) == EIA_MASK) { /* expand the array once all 15 + n * 16 slots are full */ struct ext4_xattr_inode_array *new_array = NULL; - int count = (*ea_inode_array)->count; - /* if new_array is NULL, this is essentially offsetof() */ new_array = kmalloc( - offsetof(struct ext4_xattr_inode_array, - inodes[count + EIA_INCR]), - GFP_NOFS); + struct_size(*ea_inode_array, inodes, + (*ea_inode_array)->count + EIA_INCR), + GFP_NOFS); if (new_array == NULL) return -ENOMEM; memcpy(new_array, *ea_inode_array, - offsetof(struct ext4_xattr_inode_array, inodes[count])); + struct_size(*ea_inode_array, inodes, + (*ea_inode_array)->count)); kfree(*ea_inode_array); *ea_inode_array = new_array; } - (*ea_inode_array)->inodes[(*ea_inode_array)->count++] = inode; + (*ea_inode_array)->count++; + (*ea_inode_array)->inodes[(*ea_inode_array)->count - 1] = inode; return 0; } @@ -3032,8 +3044,6 @@ void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array) * * Create a new entry in the extended attribute block cache, and insert * it unless such an entry is already in the cache. - * - * Returns 0, or a negative error number on failure. */ static void ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache, @@ -3061,8 +3071,7 @@ ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache, * * Compare two extended attribute blocks for equality. * - * Returns 0 if the blocks are equal, 1 if they differ, and - * a negative error number on errors. + * Returns 0 if the blocks are equal, 1 if they differ. */ static int ext4_xattr_cmp(struct ext4_xattr_header *header1, @@ -3101,8 +3110,8 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1, * * Find an identical extended attribute block. * - * Returns a pointer to the block found, or NULL if such a block was - * not found or an error occurred. + * Returns a pointer to the block found, or NULL if such a block was not + * found, or an error pointer if an error occurred while reading ea block. */ static struct buffer_head * ext4_xattr_block_cache_find(struct inode *inode, @@ -3124,13 +3133,11 @@ ext4_xattr_block_cache_find(struct inode *inode, bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO); if (IS_ERR(bh)) { - if (PTR_ERR(bh) == -ENOMEM) { - mb_cache_entry_put(ea_block_cache, ce); - return NULL; - } - bh = NULL; - EXT4_ERROR_INODE(inode, "block %lu read error", - (unsigned long)ce->e_value); + if (PTR_ERR(bh) != -ENOMEM) + EXT4_ERROR_INODE(inode, "block %lu read error", + (unsigned long)ce->e_value); + mb_cache_entry_put(ea_block_cache, ce); + return bh; } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { *pce = ce; return bh; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 824faf0b15a8..b25c2d7b5f99 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -32,8 +32,7 @@ struct ext4_xattr_header { __le32 h_refcount; /* reference count */ __le32 h_blocks; /* number of disk blocks used */ __le32 h_hash; /* hash value of all attributes */ - __le32 h_checksum; /* crc32c(uuid+id+xattrblock) */ - /* id = inum if refcount=1, blknum otherwise */ + __le32 h_checksum; /* crc32c(uuid+blknum+xattrblock) */ __u32 h_reserved[3]; /* zero right now */ }; @@ -130,8 +129,8 @@ struct ext4_xattr_ibody_find { }; struct ext4_xattr_inode_array { - unsigned int count; /* # of used items in the array */ - struct inode *inodes[]; + unsigned int count; + struct inode *inodes[] __counted_by(count); }; extern const struct xattr_handler ext4_xattr_user_handler; @@ -193,7 +192,7 @@ extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); extern void ext4_evict_ea_inode(struct inode *inode); -extern const struct xattr_handler *ext4_xattr_handlers[]; +extern const struct xattr_handler * const ext4_xattr_handlers[]; extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is); |