summaryrefslogtreecommitdiff
path: root/fs/ext4
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4')
-rw-r--r--fs/ext4/acl.h5
-rw-r--r--fs/ext4/balloc.c20
-rw-r--r--fs/ext4/bitmap.c8
-rw-r--r--fs/ext4/block_validity.c7
-rw-r--r--fs/ext4/crypto.c23
-rw-r--r--fs/ext4/dir.c55
-rw-r--r--fs/ext4/ext4.h134
-rw-r--r--fs/ext4/ext4_jbd2.c7
-rw-r--r--fs/ext4/extents.c1400
-rw-r--r--fs/ext4/extents_status.c329
-rw-r--r--fs/ext4/extents_status.h33
-rw-r--r--fs/ext4/fast_commit.c25
-rw-r--r--fs/ext4/file.c30
-rw-r--r--fs/ext4/fsmap.c32
-rw-r--r--fs/ext4/ialloc.c30
-rw-r--r--fs/ext4/indirect.c13
-rw-r--r--fs/ext4/inline.c56
-rw-r--r--fs/ext4/inode-test.c1
-rw-r--r--fs/ext4/inode.c933
-rw-r--r--fs/ext4/ioctl.c28
-rw-r--r--fs/ext4/mballoc-test.c1001
-rw-r--r--fs/ext4/mballoc.c1078
-rw-r--r--fs/ext4/mballoc.h13
-rw-r--r--fs/ext4/migrate.c5
-rw-r--r--fs/ext4/move_extent.c61
-rw-r--r--fs/ext4/namei.c163
-rw-r--r--fs/ext4/orphan.c5
-rw-r--r--fs/ext4/page-io.c20
-rw-r--r--fs/ext4/readpage.c30
-rw-r--r--fs/ext4/resize.c104
-rw-r--r--fs/ext4/super.c289
-rw-r--r--fs/ext4/symlink.c8
-rw-r--r--fs/ext4/sysfs.c156
-rw-r--r--fs/ext4/verity.c8
-rw-r--r--fs/ext4/xattr.c73
-rw-r--r--fs/ext4/xattr.h9
36 files changed, 3506 insertions, 2686 deletions
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index ef4c19e5f570..0c5a79c3b5d4 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -68,11 +68,6 @@ extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
static inline int
ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
{
- /* usually, the umask is applied by posix_acl_create(), but if
- ext4 ACL support is disabled at compile time, we need to do
- it here, because posix_acl_create() will never be called */
- inode->i_mode &= ~current_umask();
-
return 0;
}
#endif /* CONFIG_EXT4_FS_POSIX_ACL */
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 396474e9e2bf..c48fd36b2d74 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -22,6 +22,7 @@
#include "mballoc.h"
#include <trace/events/ext4.h>
+#include <kunit/static_stub.h>
static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
ext4_group_t block_group);
@@ -111,10 +112,8 @@ static unsigned ext4_num_overhead_clusters(struct super_block *sb,
itbl_blk_start = ext4_inode_table(sb, gdp);
itbl_blk_end = itbl_blk_start + sbi->s_itb_per_group - 1;
if (itbl_blk_start <= end && itbl_blk_end >= start) {
- itbl_blk_start = itbl_blk_start >= start ?
- itbl_blk_start : start;
- itbl_blk_end = itbl_blk_end <= end ?
- itbl_blk_end : end;
+ itbl_blk_start = max(itbl_blk_start, start);
+ itbl_blk_end = min(itbl_blk_end, end);
itbl_cluster_start = EXT4_B2C(sbi, itbl_blk_start - start);
itbl_cluster_end = EXT4_B2C(sbi, itbl_blk_end - start);
@@ -274,6 +273,9 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct buffer_head *bh_p;
+ KUNIT_STATIC_STUB_REDIRECT(ext4_get_group_desc,
+ sb, block_group, bh);
+
if (block_group >= ngroups) {
ext4_error(sb, "block_group >= groups_count - block_group = %u,"
" groups_count = %u", block_group, ngroups);
@@ -468,6 +470,9 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
ext4_fsblk_t bitmap_blk;
int err;
+ KUNIT_STATIC_STUB_REDIRECT(ext4_read_block_bitmap_nowait,
+ sb, block_group, ignore_locked);
+
desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
return ERR_PTR(-EFSCORRUPTED);
@@ -564,6 +569,9 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
{
struct ext4_group_desc *desc;
+ KUNIT_STATIC_STUB_REDIRECT(ext4_wait_block_bitmap,
+ sb, block_group, bh);
+
if (!buffer_new(bh))
return 0;
desc = ext4_get_group_desc(sb, block_group, NULL);
@@ -641,8 +649,8 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
/* Hm, nope. Are (enough) root reserved clusters available? */
if (uid_eq(sbi->s_resuid, current_fsuid()) ||
(!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
- capable(CAP_SYS_RESOURCE) ||
- (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+ (flags & EXT4_MB_USE_ROOT_BLOCKS) ||
+ capable(CAP_SYS_RESOURCE)) {
if (free_clusters >= (nclusters + dirty_clusters +
resv_clusters))
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index cd725bebe69e..2a135075468d 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -18,15 +18,17 @@ unsigned int ext4_count_free(char *bitmap, unsigned int numchars)
int ext4_inode_bitmap_csum_verify(struct super_block *sb,
struct ext4_group_desc *gdp,
- struct buffer_head *bh, int sz)
+ struct buffer_head *bh)
{
__u32 hi;
__u32 provided, calculated;
struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int sz;
if (!ext4_has_metadata_csum(sb))
return 1;
+ sz = EXT4_INODES_PER_GROUP(sb) >> 3;
provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) {
@@ -40,14 +42,16 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb,
void ext4_inode_bitmap_csum_set(struct super_block *sb,
struct ext4_group_desc *gdp,
- struct buffer_head *bh, int sz)
+ struct buffer_head *bh)
{
__u32 csum;
struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int sz;
if (!ext4_has_metadata_csum(sb))
return;
+ sz = EXT4_INODES_PER_GROUP(sb) >> 3;
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END)
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 6fe3c941b565..e8c5525afc67 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -72,7 +72,7 @@ static int add_system_zone(struct ext4_system_blocks *system_blks,
{
struct ext4_system_zone *new_entry, *entry;
struct rb_node **n = &system_blks->root.rb_node, *node;
- struct rb_node *parent = NULL, *new_node = NULL;
+ struct rb_node *parent = NULL, *new_node;
while (*n) {
parent = *n;
@@ -351,10 +351,9 @@ int ext4_check_blockref(const char *function, unsigned int line,
{
__le32 *bref = p;
unsigned int blk;
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- if (ext4_has_feature_journal(inode->i_sb) &&
- (inode->i_ino ==
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
+ if (journal && inode == journal->j_inode)
return 0;
while (bref < p+max) {
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 453d4da5de52..0a056d97e640 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -31,11 +31,10 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
ext4_fname_from_fscrypt_name(fname, &name);
-#if IS_ENABLED(CONFIG_UNICODE)
err = ext4_fname_setup_ci_filename(dir, iname, fname);
if (err)
ext4_fname_free_filename(fname);
-#endif
+
return err;
}
@@ -51,11 +50,9 @@ int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry,
ext4_fname_from_fscrypt_name(fname, &name);
-#if IS_ENABLED(CONFIG_UNICODE)
err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname);
if (err)
ext4_fname_free_filename(fname);
-#endif
return err;
}
@@ -70,10 +67,7 @@ void ext4_fname_free_filename(struct ext4_filename *fname)
fname->usr_fname = NULL;
fname->disk_name.name = NULL;
-#if IS_ENABLED(CONFIG_UNICODE)
- kfree(fname->cf_name.name);
- fname->cf_name.name = NULL;
-#endif
+ ext4_fname_free_ci_filename(fname);
}
static bool uuid_is_zero(__u8 u[16])
@@ -232,19 +226,14 @@ static bool ext4_has_stable_inodes(struct super_block *sb)
return ext4_has_feature_stable_inodes(sb);
}
-static void ext4_get_ino_and_lblk_bits(struct super_block *sb,
- int *ino_bits_ret, int *lblk_bits_ret)
-{
- *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count);
- *lblk_bits_ret = 8 * sizeof(ext4_lblk_t);
-}
-
const struct fscrypt_operations ext4_cryptops = {
- .key_prefix = "ext4:",
+ .needs_bounce_pages = 1,
+ .has_32bit_inodes = 1,
+ .supports_subblock_data_units = 1,
+ .legacy_key_prefix = "ext4:",
.get_context = ext4_get_context,
.set_context = ext4_set_context,
.get_dummy_policy = ext4_get_dummy_policy,
.empty_dir = ext4_empty_dir,
.has_stable_inodes = ext4_has_stable_inodes,
- .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits,
};
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 7ea33c3fe94e..b278b5703c19 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -104,6 +104,9 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
else if (unlikely(le32_to_cpu(de->inode) >
le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
error_msg = "inode out of bounds";
+ else if (unlikely(next_offset == size && de->name_len == 1 &&
+ de->name[0] == '.'))
+ error_msg = "'.' directory cannot be the last in data block";
else
return 0;
@@ -133,6 +136,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
struct super_block *sb = inode->i_sb;
struct buffer_head *bh = NULL;
struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
+ struct dir_private_info *info = file->private_data;
err = fscrypt_prepare_readdir(inode);
if (err)
@@ -192,7 +196,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
(PAGE_SHIFT - inode->i_blkbits);
if (!ra_has_index(&file->f_ra, index))
page_cache_sync_readahead(
- sb->s_bdev->bd_inode->i_mapping,
+ sb->s_bdev->bd_mapping,
&file->f_ra, file,
index, 1);
file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT;
@@ -229,7 +233,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
* to make sure. */
- if (!inode_eq_iversion(inode, file->f_version)) {
+ if (!inode_eq_iversion(inode, info->cookie)) {
for (i = 0; i < sb->s_blocksize && i < offset; ) {
de = (struct ext4_dir_entry_2 *)
(bh->b_data + i);
@@ -249,7 +253,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
offset = i;
ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
| offset;
- file->f_version = inode_query_iversion(inode);
+ info->cookie = inode_query_iversion(inode);
}
while (ctx->pos < inode->i_size
@@ -392,6 +396,7 @@ static inline loff_t ext4_get_htree_eof(struct file *filp)
static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
+ struct dir_private_info *info = file->private_data;
int dx_dir = is_dx_dir(inode);
loff_t ret, htree_max = ext4_get_htree_eof(file);
@@ -400,7 +405,7 @@ static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
htree_max, htree_max);
else
ret = ext4_llseek(file, offset, whence);
- file->f_version = inode_peek_iversion(inode) - 1;
+ info->cookie = inode_peek_iversion(inode) - 1;
return ret;
}
@@ -437,18 +442,15 @@ static void free_rb_tree_fname(struct rb_root *root)
*root = RB_ROOT;
}
-
-static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
- loff_t pos)
+static void ext4_htree_init_dir_info(struct file *filp, loff_t pos)
{
- struct dir_private_info *p;
-
- p = kzalloc(sizeof(*p), GFP_KERNEL);
- if (!p)
- return NULL;
- p->curr_hash = pos2maj_hash(filp, pos);
- p->curr_minor_hash = pos2min_hash(filp, pos);
- return p;
+ struct dir_private_info *p = filp->private_data;
+
+ if (is_dx_dir(file_inode(filp)) && !p->initialized) {
+ p->curr_hash = pos2maj_hash(filp, pos);
+ p->curr_minor_hash = pos2min_hash(filp, pos);
+ p->initialized = true;
+ }
}
void ext4_htree_free_dir_info(struct dir_private_info *p)
@@ -560,12 +562,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
struct fname *fname;
int ret = 0;
- if (!info) {
- info = ext4_htree_create_dir_info(file, ctx->pos);
- if (!info)
- return -ENOMEM;
- file->private_data = info;
- }
+ ext4_htree_init_dir_info(file, ctx->pos);
if (ctx->pos == ext4_get_htree_eof(file))
return 0; /* EOF */
@@ -598,10 +595,10 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
* cached entries.
*/
if ((!info->curr_node) ||
- !inode_eq_iversion(inode, file->f_version)) {
+ !inode_eq_iversion(inode, info->cookie)) {
info->curr_node = NULL;
free_rb_tree_fname(&info->root);
- file->f_version = inode_query_iversion(inode);
+ info->cookie = inode_query_iversion(inode);
ret = ext4_htree_fill_tree(file, info->curr_hash,
info->curr_minor_hash,
&info->next_hash);
@@ -672,7 +669,19 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
return 0;
}
+static int ext4_dir_open(struct inode *inode, struct file *file)
+{
+ struct dir_private_info *info;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+ file->private_data = info;
+ return 0;
+}
+
const struct file_operations ext4_dir_operations = {
+ .open = ext4_dir_open,
.llseek = ext4_dir_llseek,
.read = generic_read_dir,
.iterate_shared = ext4_readdir,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3db01b933c3e..a95525bfb99c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -213,11 +213,14 @@ enum criteria {
#define EXT4_MB_USE_RESERVED 0x2000
/* Do strict check for free blocks while retrying block allocation */
#define EXT4_MB_STRICT_CHECK 0x4000
-/* Large fragment size list lookup succeeded at least once for cr = 0 */
+/* Large fragment size list lookup succeeded at least once for
+ * CR_POWER2_ALIGNED */
#define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED 0x8000
-/* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */
+/* Avg fragment size rb tree lookup succeeded at least once for
+ * CR_GOAL_LEN_FAST */
#define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED 0x00010000
-/* Avg fragment size rb tree lookup succeeded at least once for cr = 1.5 */
+/* Avg fragment size rb tree lookup succeeded at least once for
+ * CR_BEST_AVAIL_LEN */
#define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED 0x00020000
struct ext4_allocation_request {
@@ -252,8 +255,10 @@ struct ext4_allocation_request {
#define EXT4_MAP_MAPPED BIT(BH_Mapped)
#define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten)
#define EXT4_MAP_BOUNDARY BIT(BH_Boundary)
+#define EXT4_MAP_DELAYED BIT(BH_Delay)
#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
- EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY)
+ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
+ EXT4_MAP_DELAYED)
struct ext4_map_blocks {
ext4_fsblk_t m_pblk;
@@ -273,7 +278,8 @@ struct ext4_system_blocks {
/*
* Flags for ext4_io_end->flags
*/
-#define EXT4_IO_END_UNWRITTEN 0x0001
+#define EXT4_IO_END_UNWRITTEN 0x0001
+#define EXT4_IO_END_FAILED 0x0002
struct ext4_io_end_vec {
struct list_head list; /* list of io_end_vec */
@@ -362,6 +368,8 @@ struct ext4_io_submit {
#define EXT4_MAX_BLOCKS(size, offset, blkbits) \
((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \
blkbits))
+#define EXT4_B_TO_LBLK(inode, offset) \
+ (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits)
/* Translate a block number to a cluster number */
#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits)
@@ -891,10 +899,13 @@ do { \
(raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (ts).tv_sec, S32_MIN, S32_MAX)); \
} while (0)
-#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
- EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, (inode)->xtime)
+#define EXT4_INODE_SET_ATIME(inode, raw_inode) \
+ EXT4_INODE_SET_XTIME_VAL(i_atime, inode, raw_inode, inode_get_atime(inode))
-#define EXT4_INODE_SET_CTIME(inode, raw_inode) \
+#define EXT4_INODE_SET_MTIME(inode, raw_inode) \
+ EXT4_INODE_SET_XTIME_VAL(i_mtime, inode, raw_inode, inode_get_mtime(inode))
+
+#define EXT4_INODE_SET_CTIME(inode, raw_inode) \
EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode))
#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \
@@ -910,9 +921,16 @@ do { \
.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime) \
})
-#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \
+#define EXT4_INODE_GET_ATIME(inode, raw_inode) \
do { \
- (inode)->xtime = EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode); \
+ inode_set_atime_to_ts(inode, \
+ EXT4_INODE_GET_XTIME_VAL(i_atime, inode, raw_inode)); \
+} while (0)
+
+#define EXT4_INODE_GET_MTIME(inode, raw_inode) \
+do { \
+ inode_set_mtime_to_ts(inode, \
+ EXT4_INODE_GET_XTIME_VAL(i_mtime, inode, raw_inode)); \
} while (0)
#define EXT4_INODE_GET_CTIME(inode, raw_inode) \
@@ -1043,6 +1061,7 @@ struct ext4_inode_info {
/* Number of ongoing updates on this inode */
atomic_t i_fc_updates;
+ atomic_t i_unwritten; /* Nr. of inflight conversions pending */
/* Fast commit wait queue for this inode */
wait_queue_head_t i_fc_wait;
@@ -1091,6 +1110,10 @@ struct ext4_inode_info {
/* mballoc */
atomic_t i_prealloc_active;
+
+ /* allocation reservation info for delalloc */
+ /* In case of bigalloc, this refer to clusters rather than blocks */
+ unsigned int i_reserved_data_blocks;
struct rb_root i_prealloc_node;
rwlock_t i_prealloc_lock;
@@ -1107,10 +1130,6 @@ struct ext4_inode_info {
/* ialloc */
ext4_group_t i_last_alloc_group;
- /* allocation reservation info for delalloc */
- /* In case of bigalloc, this refer to clusters rather than blocks */
- unsigned int i_reserved_data_blocks;
-
/* pending cluster reservations for bigalloc file systems */
struct ext4_pending_tree i_pending_tree;
@@ -1134,7 +1153,6 @@ struct ext4_inode_info {
*/
struct list_head i_rsv_conversion_list;
struct work_struct i_rsv_conversion_work;
- atomic_t i_unwritten; /* Nr. of inflight conversions pending */
spinlock_t i_block_reservation_lock;
@@ -1332,7 +1350,7 @@ struct ext4_super_block {
/*60*/ __le32 s_feature_incompat; /* incompatible feature set */
__le32 s_feature_ro_compat; /* readonly-compatible feature set */
/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */
-/*78*/ char s_volume_name[EXT4_LABEL_MAX]; /* volume name */
+/*78*/ char s_volume_name[EXT4_LABEL_MAX] __nonstring; /* volume name */
/*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */
/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */
/*
@@ -1494,6 +1512,7 @@ struct ext4_sb_info {
loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
struct buffer_head * s_sbh; /* Buffer containing the super block */
struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
+ /* Array of bh's for the block group descriptors */
struct buffer_head * __rcu *s_group_desc;
unsigned int s_mount_opt;
unsigned int s_mount_opt2;
@@ -1537,7 +1556,7 @@ struct ext4_sb_info {
unsigned long s_commit_interval;
u32 s_max_batch_time;
u32 s_min_batch_time;
- struct block_device *s_journal_bdev;
+ struct file *s_journal_bdev_file;
#ifdef CONFIG_QUOTA
/* Names of quota files with journalled quota */
char __rcu *s_qf_names[EXT4_MAXQUOTAS];
@@ -1564,7 +1583,7 @@ struct ext4_sb_info {
unsigned int *s_mb_maxs;
unsigned int s_group_info_size;
unsigned int s_mb_free_pending;
- struct list_head s_freed_data_list; /* List of blocks to be freed
+ struct list_head s_freed_data_list[2]; /* List of blocks to be freed
after commit completed */
struct list_head s_discard_list;
struct work_struct s_discard_work;
@@ -1653,7 +1672,7 @@ struct ext4_sb_info {
__u32 s_csum_seed;
/* Reclaim extents from extent status tree */
- struct shrinker s_es_shrinker;
+ struct shrinker *s_es_shrinker;
struct list_head s_es_list; /* List of inodes with reclaimable extents */
long s_es_nr_inode;
struct ext4_es_stats s_es_stats;
@@ -2314,9 +2333,9 @@ struct ext4_dir_entry_2 {
((struct ext4_dir_entry_hash *) \
(((void *)(entry)) + \
((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND)))
-#define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(de)->hash)
+#define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(entry)->hash)
#define EXT4_DIRENT_MINOR_HASH(entry) \
- le32_to_cpu(EXT4_DIRENT_HASHES(de)->minor_hash)
+ le32_to_cpu(EXT4_DIRENT_HASHES(entry)->minor_hash)
static inline bool ext4_hash_in_dirent(const struct inode *inode)
{
@@ -2438,6 +2457,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
#define DX_HASH_HALF_MD4_UNSIGNED 4
#define DX_HASH_TEA_UNSIGNED 5
#define DX_HASH_SIPHASH 6
+#define DX_HASH_LAST DX_HASH_SIPHASH
static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
const void *address, unsigned int length)
@@ -2487,7 +2507,7 @@ struct ext4_filename {
struct fscrypt_str crypto_buf;
#endif
#if IS_ENABLED(CONFIG_UNICODE)
- struct fscrypt_str cf_name;
+ struct qstr cf_name;
#endif
};
@@ -2529,6 +2549,8 @@ struct dir_private_info {
__u32 curr_hash;
__u32 curr_minor_hash;
__u32 next_hash;
+ u64 cookie;
+ bool initialized;
};
/* calculate the first block number of the group */
@@ -2669,10 +2691,10 @@ struct mmpd_data {
extern unsigned int ext4_count_free(char *bitmap, unsigned numchars);
void ext4_inode_bitmap_csum_set(struct super_block *sb,
struct ext4_group_desc *gdp,
- struct buffer_head *bh, int sz);
+ struct buffer_head *bh);
int ext4_inode_bitmap_csum_verify(struct super_block *sb,
struct ext4_group_desc *gdp,
- struct buffer_head *bh, int sz);
+ struct buffer_head *bh);
void ext4_block_bitmap_csum_set(struct super_block *sb,
struct ext4_group_desc *gdp,
struct buffer_head *bh);
@@ -2721,8 +2743,25 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
#if IS_ENABLED(CONFIG_UNICODE)
extern int ext4_fname_setup_ci_filename(struct inode *dir,
- const struct qstr *iname,
- struct ext4_filename *fname);
+ const struct qstr *iname,
+ struct ext4_filename *fname);
+
+static inline void ext4_fname_free_ci_filename(struct ext4_filename *fname)
+{
+ kfree(fname->cf_name.name);
+ fname->cf_name.name = NULL;
+}
+#else
+static inline int ext4_fname_setup_ci_filename(struct inode *dir,
+ const struct qstr *iname,
+ struct ext4_filename *fname)
+{
+ return 0;
+}
+
+static inline void ext4_fname_free_ci_filename(struct ext4_filename *fname)
+{
+}
#endif
/* ext4 encryption related stuff goes here crypto.c */
@@ -2745,16 +2784,11 @@ static inline int ext4_fname_setup_filename(struct inode *dir,
int lookup,
struct ext4_filename *fname)
{
- int err = 0;
fname->usr_fname = iname;
fname->disk_name.name = (unsigned char *) iname->name;
fname->disk_name.len = iname->len;
-#if IS_ENABLED(CONFIG_UNICODE)
- err = ext4_fname_setup_ci_filename(dir, iname, fname);
-#endif
-
- return err;
+ return ext4_fname_setup_ci_filename(dir, iname, fname);
}
static inline int ext4_fname_prepare_lookup(struct inode *dir,
@@ -2766,10 +2800,7 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir,
static inline void ext4_fname_free_filename(struct ext4_filename *fname)
{
-#if IS_ENABLED(CONFIG_UNICODE)
- kfree(fname->cf_name.name);
- fname->cf_name.name = NULL;
-#endif
+ ext4_fname_free_ci_filename(fname);
}
static inline int ext4_ioctl_get_encryption_pwsalt(struct file *filp,
@@ -2893,10 +2924,10 @@ extern const struct seq_operations ext4_mb_seq_groups_ops;
extern const struct seq_operations ext4_mb_seq_structs_summary_ops;
extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset);
extern int ext4_mb_init(struct super_block *);
-extern int ext4_mb_release(struct super_block *);
+extern void ext4_mb_release(struct super_block *);
extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
struct ext4_allocation_request *, int *);
-extern void ext4_discard_preallocations(struct inode *, unsigned int);
+extern void ext4_discard_preallocations(struct inode *);
extern int __init ext4_init_mballoc(void);
extern void ext4_exit_mballoc(void);
extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
@@ -2917,7 +2948,7 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid);
extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
- int len, int state);
+ int len, bool state);
static inline bool ext4_mb_cr_expensive(enum criteria cr)
{
return cr >= CR_GOAL_LEN_SLOW;
@@ -2984,6 +3015,8 @@ extern int ext4_inode_attach_jinode(struct inode *inode);
extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
extern int ext4_break_layouts(struct inode *);
+extern int ext4_truncate_page_cache_block_range(struct inode *inode,
+ loff_t start, loff_t end);
extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
@@ -3338,6 +3371,13 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
return 1 << sbi->s_log_groups_per_flex;
}
+static inline loff_t ext4_get_maxbytes(struct inode *inode)
+{
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return inode->i_sb->s_maxbytes;
+ return EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+}
+
#define ext4_std_error(sb, errno) \
do { \
if ((errno)) \
@@ -3530,13 +3570,13 @@ int ext4_readpage_inline(struct inode *inode, struct folio *folio);
extern int ext4_try_to_write_inline_data(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
- struct page **pagep);
+ struct folio **foliop);
int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
unsigned copied, struct folio *folio);
extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
- struct page **pagep,
+ struct folio **foliop,
void **fsdata);
extern int ext4_try_add_inline_entry(handle_t *handle,
struct ext4_filename *fname,
@@ -3677,11 +3717,12 @@ extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
int num,
struct ext4_ext_path *path);
-extern int ext4_ext_insert_extent(handle_t *, struct inode *,
- struct ext4_ext_path **,
- struct ext4_extent *, int);
+extern struct ext4_ext_path *ext4_ext_insert_extent(
+ handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext, int gb_flags);
extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t,
- struct ext4_ext_path **,
+ struct ext4_ext_path *,
int flags);
extern void ext4_free_ext_path(struct ext4_ext_path *);
extern int ext4_ext_check_inode(struct inode *inode);
@@ -3818,6 +3859,9 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh)
return buffer_uptodate(bh);
}
+extern int ext4_block_write_begin(handle_t *handle, struct folio *folio,
+ loff_t pos, unsigned len,
+ get_block_t *get_block);
#endif /* __KERNEL__ */
#define EFSBADCRC EBADMSG /* Bad CRC detected */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index d1a2e6624401..da4a82456383 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -206,7 +206,7 @@ static void ext4_journal_abort_handle(const char *caller, unsigned int line,
static void ext4_check_bdev_write_error(struct super_block *sb)
{
- struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
+ struct address_space *mapping = sb->s_bdev->bd_mapping;
struct ext4_sb_info *sbi = EXT4_SB(sb);
int err;
@@ -235,8 +235,6 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
might_sleep();
- ext4_check_bdev_write_error(sb);
-
if (ext4_handle_valid(handle)) {
err = jbd2_journal_get_write_access(handle, bh);
if (err) {
@@ -244,7 +242,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
handle, err);
return err;
}
- }
+ } else
+ ext4_check_bdev_write_error(sb);
if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb))
return 0;
BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 5ea75af6ca22..2f9c3cd4f26c 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -84,12 +84,11 @@ static void ext4_extent_block_csum_set(struct inode *inode,
et->et_checksum = ext4_extent_block_csum(inode, eh);
}
-static int ext4_split_extent_at(handle_t *handle,
- struct inode *inode,
- struct ext4_ext_path **ppath,
- ext4_lblk_t split,
- int split_flag,
- int flags);
+static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t split,
+ int split_flag, int flags);
static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
{
@@ -100,27 +99,33 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
* i_rwsem. So we can safely drop the i_data_sem here.
*/
BUG_ON(EXT4_JOURNAL(inode) == NULL);
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
up_write(&EXT4_I(inode)->i_data_sem);
*dropped = 1;
return 0;
}
+static inline void ext4_ext_path_brelse(struct ext4_ext_path *path)
+{
+ brelse(path->p_bh);
+ path->p_bh = NULL;
+}
+
static void ext4_ext_drop_refs(struct ext4_ext_path *path)
{
int depth, i;
- if (!path)
+ if (IS_ERR_OR_NULL(path))
return;
depth = path->p_depth;
- for (i = 0; i <= depth; i++, path++) {
- brelse(path->p_bh);
- path->p_bh = NULL;
- }
+ for (i = 0; i <= depth; i++, path++)
+ ext4_ext_path_brelse(path);
}
void ext4_free_ext_path(struct ext4_ext_path *path)
{
+ if (IS_ERR_OR_NULL(path))
+ return;
ext4_ext_drop_refs(path);
kfree(path);
}
@@ -323,19 +328,18 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
return size;
}
-static inline int
+static inline struct ext4_ext_path *
ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
- struct ext4_ext_path **ppath, ext4_lblk_t lblk,
+ struct ext4_ext_path *path, ext4_lblk_t lblk,
int nofail)
{
- struct ext4_ext_path *path = *ppath;
int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO;
if (nofail)
flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL;
- return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
+ return ext4_split_extent_at(handle, inode, path, lblk, unwritten ?
EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
flags);
}
@@ -635,8 +639,7 @@ int ext4_ext_precache(struct inode *inode)
*/
if ((i == depth) ||
path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
- brelse(path[i].p_bh);
- path[i].p_bh = NULL;
+ ext4_ext_path_brelse(path + i);
i--;
continue;
}
@@ -689,7 +692,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
struct ext4_extent *ex;
int i;
- if (!path)
+ if (IS_ERR_OR_NULL(path))
return;
eh = path[depth].p_hdr;
@@ -881,11 +884,10 @@ void ext4_ext_tree_init(handle_t *handle, struct inode *inode)
struct ext4_ext_path *
ext4_find_extent(struct inode *inode, ext4_lblk_t block,
- struct ext4_ext_path **orig_path, int flags)
+ struct ext4_ext_path *path, int flags)
{
struct ext4_extent_header *eh;
struct buffer_head *bh;
- struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
short int depth, i, ppos = 0;
int ret;
gfp_t gfp_flags = GFP_NOFS;
@@ -906,7 +908,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
ext4_ext_drop_refs(path);
if (depth > path[0].p_maxdepth) {
kfree(path);
- *orig_path = path = NULL;
+ path = NULL;
}
}
if (!path) {
@@ -957,14 +959,10 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
ext4_ext_show_path(inode, path);
- if (orig_path)
- *orig_path = path;
return path;
err:
ext4_free_ext_path(path);
- if (orig_path)
- *orig_path = NULL;
return ERR_PTR(ret);
}
@@ -1397,15 +1395,15 @@ out:
* finds empty index and adds new leaf.
* if no free index is found, then it requests in-depth growing.
*/
-static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
- unsigned int mb_flags,
- unsigned int gb_flags,
- struct ext4_ext_path **ppath,
- struct ext4_extent *newext)
+static struct ext4_ext_path *
+ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
+ unsigned int mb_flags, unsigned int gb_flags,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext)
{
- struct ext4_ext_path *path = *ppath;
struct ext4_ext_path *curp;
int depth, i, err = 0;
+ ext4_lblk_t ee_block = le32_to_cpu(newext->ee_block);
repeat:
i = depth = ext_depth(inode);
@@ -1424,42 +1422,38 @@ repeat:
* entry: create all needed subtree and add new leaf */
err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
if (err)
- goto out;
+ goto errout;
/* refill path */
- path = ext4_find_extent(inode,
- (ext4_lblk_t)le32_to_cpu(newext->ee_block),
- ppath, gb_flags);
- if (IS_ERR(path))
- err = PTR_ERR(path);
- } else {
- /* tree is full, time to grow in depth */
- err = ext4_ext_grow_indepth(handle, inode, mb_flags);
- if (err)
- goto out;
+ path = ext4_find_extent(inode, ee_block, path, gb_flags);
+ return path;
+ }
- /* refill path */
- path = ext4_find_extent(inode,
- (ext4_lblk_t)le32_to_cpu(newext->ee_block),
- ppath, gb_flags);
- if (IS_ERR(path)) {
- err = PTR_ERR(path);
- goto out;
- }
+ /* tree is full, time to grow in depth */
+ err = ext4_ext_grow_indepth(handle, inode, mb_flags);
+ if (err)
+ goto errout;
- /*
- * only first (depth 0 -> 1) produces free space;
- * in all other cases we have to split the grown tree
- */
- depth = ext_depth(inode);
- if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
- /* now we need to split */
- goto repeat;
- }
+ /* refill path */
+ path = ext4_find_extent(inode, ee_block, path, gb_flags);
+ if (IS_ERR(path))
+ return path;
+
+ /*
+ * only first (depth 0 -> 1) produces free space;
+ * in all other cases we have to split the grown tree
+ */
+ depth = ext_depth(inode);
+ if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
+ /* now we need to split */
+ goto repeat;
}
-out:
- return err;
+ return path;
+
+errout:
+ ext4_free_ext_path(path);
+ return ERR_PTR(err);
}
/*
@@ -1751,12 +1745,23 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
break;
err = ext4_ext_get_access(handle, inode, path + k);
if (err)
- break;
+ goto clean;
path[k].p_idx->ei_block = border;
err = ext4_ext_dirty(handle, inode, path + k);
if (err)
- break;
+ goto clean;
}
+ return 0;
+
+clean:
+ /*
+ * The path[k].p_bh is either unmodified or with no verified bit
+ * set (see ext4_ext_get_access()). So just clear the verified bit
+ * of the successfully modified extents buffers, which will force
+ * these extents to be checked to avoid using inconsistent data.
+ */
+ while (++k < depth)
+ clear_buffer_verified(path[k].p_bh);
return err;
}
@@ -1878,8 +1883,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle,
(path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
path[0].p_hdr->eh_max = cpu_to_le16(max_root);
- brelse(path[1].p_bh);
- path[1].p_bh = NULL;
+ ext4_ext_path_brelse(path + 1);
ext4_free_blocks(handle, inode, NULL, blk, 1,
EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
}
@@ -1967,16 +1971,15 @@ out:
* inserts requested extent as new one into the tree,
* creating new leaf in the no-space case.
*/
-int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
- struct ext4_ext_path **ppath,
- struct ext4_extent *newext, int gb_flags)
+struct ext4_ext_path *
+ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext, int gb_flags)
{
- struct ext4_ext_path *path = *ppath;
struct ext4_extent_header *eh;
struct ext4_extent *ex, *fex;
struct ext4_extent *nearex; /* nearest extent */
- struct ext4_ext_path *npath = NULL;
- int depth, len, err;
+ int depth, len, err = 0;
ext4_lblk_t next;
int mb_flags = 0, unwritten;
@@ -1984,14 +1987,16 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
mb_flags |= EXT4_MB_DELALLOC_RESERVED;
if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
- return -EFSCORRUPTED;
+ err = -EFSCORRUPTED;
+ goto errout;
}
depth = ext_depth(inode);
ex = path[depth].p_ext;
eh = path[depth].p_hdr;
if (unlikely(path[depth].p_hdr == NULL)) {
EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
- return -EFSCORRUPTED;
+ err = -EFSCORRUPTED;
+ goto errout;
}
/* try to insert block into found extent and return */
@@ -2029,7 +2034,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
err = ext4_ext_get_access(handle, inode,
path + depth);
if (err)
- return err;
+ goto errout;
unwritten = ext4_ext_is_unwritten(ex);
ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ ext4_ext_get_actual_len(newext));
@@ -2054,7 +2059,7 @@ prepend:
err = ext4_ext_get_access(handle, inode,
path + depth);
if (err)
- return err;
+ goto errout;
unwritten = ext4_ext_is_unwritten(ex);
ex->ee_block = newext->ee_block;
@@ -2079,21 +2084,26 @@ prepend:
if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
next = ext4_ext_next_leaf_block(path);
if (next != EXT_MAX_BLOCKS) {
+ struct ext4_ext_path *npath;
+
ext_debug(inode, "next leaf block - %u\n", next);
- BUG_ON(npath != NULL);
npath = ext4_find_extent(inode, next, NULL, gb_flags);
- if (IS_ERR(npath))
- return PTR_ERR(npath);
+ if (IS_ERR(npath)) {
+ err = PTR_ERR(npath);
+ goto errout;
+ }
BUG_ON(npath->p_depth != path->p_depth);
eh = npath[depth].p_hdr;
if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
ext_debug(inode, "next leaf isn't full(%d)\n",
le16_to_cpu(eh->eh_entries));
+ ext4_free_ext_path(path);
path = npath;
goto has_space;
}
ext_debug(inode, "next leaf has no free space(%d,%d)\n",
le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
+ ext4_free_ext_path(npath);
}
/*
@@ -2102,11 +2112,10 @@ prepend:
*/
if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
mb_flags |= EXT4_MB_USE_RESERVED;
- err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
- ppath, newext);
- if (err)
- goto cleanup;
- path = *ppath;
+ path = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
+ path, newext);
+ if (IS_ERR(path))
+ return path;
depth = ext_depth(inode);
eh = path[depth].p_hdr;
@@ -2115,7 +2124,7 @@ has_space:
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
- goto cleanup;
+ goto errout;
if (!nearex) {
/* there is no extent in this leaf, create first one */
@@ -2173,17 +2182,20 @@ merge:
if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
ext4_ext_try_to_merge(handle, inode, path, nearex);
-
/* time to correct all indexes above */
err = ext4_ext_correct_indexes(handle, inode, path);
if (err)
- goto cleanup;
+ goto errout;
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+ if (err)
+ goto errout;
-cleanup:
- ext4_free_ext_path(npath);
- return err;
+ return path;
+
+errout:
+ ext4_free_ext_path(path);
+ return ERR_PTR(err);
}
static int ext4_fill_es_cache_info(struct inode *inode,
@@ -2283,27 +2295,26 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
{
int err;
ext4_fsblk_t leaf;
+ int k = depth - 1;
/* free index block */
- depth--;
- path = path + depth;
- leaf = ext4_idx_pblock(path->p_idx);
- if (unlikely(path->p_hdr->eh_entries == 0)) {
- EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
+ leaf = ext4_idx_pblock(path[k].p_idx);
+ if (unlikely(path[k].p_hdr->eh_entries == 0)) {
+ EXT4_ERROR_INODE(inode, "path[%d].p_hdr->eh_entries == 0", k);
return -EFSCORRUPTED;
}
- err = ext4_ext_get_access(handle, inode, path);
+ err = ext4_ext_get_access(handle, inode, path + k);
if (err)
return err;
- if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
- int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
+ if (path[k].p_idx != EXT_LAST_INDEX(path[k].p_hdr)) {
+ int len = EXT_LAST_INDEX(path[k].p_hdr) - path[k].p_idx;
len *= sizeof(struct ext4_extent_idx);
- memmove(path->p_idx, path->p_idx + 1, len);
+ memmove(path[k].p_idx, path[k].p_idx + 1, len);
}
- le16_add_cpu(&path->p_hdr->eh_entries, -1);
- err = ext4_ext_dirty(handle, inode, path);
+ le16_add_cpu(&path[k].p_hdr->eh_entries, -1);
+ err = ext4_ext_dirty(handle, inode, path + k);
if (err)
return err;
ext_debug(inode, "index is empty, remove it, free block %llu\n", leaf);
@@ -2312,18 +2323,29 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
ext4_free_blocks(handle, inode, NULL, leaf, 1,
EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
- while (--depth >= 0) {
- if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
+ while (--k >= 0) {
+ if (path[k + 1].p_idx != EXT_FIRST_INDEX(path[k + 1].p_hdr))
break;
- path--;
- err = ext4_ext_get_access(handle, inode, path);
+ err = ext4_ext_get_access(handle, inode, path + k);
if (err)
- break;
- path->p_idx->ei_block = (path+1)->p_idx->ei_block;
- err = ext4_ext_dirty(handle, inode, path);
+ goto clean;
+ path[k].p_idx->ei_block = path[k + 1].p_idx->ei_block;
+ err = ext4_ext_dirty(handle, inode, path + k);
if (err)
- break;
+ goto clean;
}
+ return 0;
+
+clean:
+ /*
+ * The path[k].p_bh is either unmodified or with no verified bit
+ * set (see ext4_ext_get_access()). So just clear the verified bit
+ * of the successfully modified extents buffers, which will force
+ * these extents to be checked to avoid using inconsistent data.
+ */
+ while (++k < depth)
+ clear_buffer_verified(path[k].p_bh);
+
return err;
}
@@ -2374,18 +2396,19 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
{
int index;
- int depth;
/* If we are converting the inline data, only one is needed here. */
if (ext4_has_inline_data(inode))
return 1;
- depth = ext_depth(inode);
-
+ /*
+ * Extent tree can change between the time we estimate credits and
+ * the time we actually modify the tree. Assume the worst case.
+ */
if (extents <= 1)
- index = depth * 2;
+ index = EXT4_MAX_EXTENT_DEPTH * 2;
else
- index = depth * 3;
+ index = EXT4_MAX_EXTENT_DEPTH * 3;
return index;
}
@@ -2876,11 +2899,12 @@ again:
* fail removing space due to ENOSPC so try to use
* reserved block if that happens.
*/
- err = ext4_force_split_extent_at(handle, inode, &path,
- end + 1, 1);
- if (err < 0)
+ path = ext4_force_split_extent_at(handle, inode, path,
+ end + 1, 1);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
goto out;
-
+ }
} else if (sbi->s_cluster_ratio > 1 && end >= ex_end &&
partial.state == initial) {
/*
@@ -2938,8 +2962,7 @@ again:
err = ext4_ext_rm_leaf(handle, inode, path,
&partial, start, end);
/* root level has p_bh == NULL, brelse() eats this */
- brelse(path[i].p_bh);
- path[i].p_bh = NULL;
+ ext4_ext_path_brelse(path + i);
i--;
continue;
}
@@ -3001,8 +3024,7 @@ again:
err = ext4_ext_rm_idx(handle, inode, path, i);
}
/* root level has p_bh == NULL, brelse() eats this */
- brelse(path[i].p_bh);
- path[i].p_bh = NULL;
+ ext4_ext_path_brelse(path + i);
i--;
ext_debug(inode, "return to level %d\n", i);
}
@@ -3117,7 +3139,7 @@ static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
return;
ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
- EXTENT_STATUS_WRITTEN);
+ EXTENT_STATUS_WRITTEN, 0);
}
/* FIXME!! we need to try to merge to left or right after zero-out */
@@ -3151,16 +3173,14 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
* a> the extent are splitted into two extent.
* b> split is not needed, and just mark the extent.
*
- * return 0 on success.
+ * Return an extent path pointer on success, or an error pointer on failure.
*/
-static int ext4_split_extent_at(handle_t *handle,
- struct inode *inode,
- struct ext4_ext_path **ppath,
- ext4_lblk_t split,
- int split_flag,
- int flags)
+static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t split,
+ int split_flag, int flags)
{
- struct ext4_ext_path *path = *ppath;
ext4_fsblk_t newblock;
ext4_lblk_t ee_block;
struct ext4_extent *ex, newex, orig_ex, zero_ex;
@@ -3230,24 +3250,27 @@ static int ext4_split_extent_at(handle_t *handle,
if (split_flag & EXT4_EXT_MARK_UNWRIT2)
ext4_ext_mark_unwritten(ex2);
- err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
- if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM)
+ path = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+ if (!IS_ERR(path))
goto out;
+ err = PTR_ERR(path);
+ if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM)
+ return path;
+
/*
- * Update path is required because previous ext4_ext_insert_extent()
- * may have freed or reallocated the path. Using EXT4_EX_NOFAIL
- * guarantees that ext4_find_extent() will not return -ENOMEM,
- * otherwise -ENOMEM will cause a retry in do_writepages(), and a
- * WARN_ON may be triggered in ext4_da_update_reserve_space() due to
- * an incorrect ee_len causing the i_reserved_data_blocks exception.
+ * Get a new path to try to zeroout or fix the extent length.
+ * Using EXT4_EX_NOFAIL guarantees that ext4_find_extent()
+ * will not return -ENOMEM, otherwise -ENOMEM will cause a
+ * retry in do_writepages(), and a WARN_ON may be triggered
+ * in ext4_da_update_reserve_space() due to an incorrect
+ * ee_len causing the i_reserved_data_blocks exception.
*/
- path = ext4_find_extent(inode, ee_block, ppath,
- flags | EXT4_EX_NOFAIL);
+ path = ext4_find_extent(inode, ee_block, NULL, flags | EXT4_EX_NOFAIL);
if (IS_ERR(path)) {
EXT4_ERROR_INODE(inode, "Failed split extent on %u, err %ld",
split, PTR_ERR(path));
- return PTR_ERR(path);
+ return path;
}
depth = ext_depth(inode);
ex = path[depth].p_ext;
@@ -3302,10 +3325,13 @@ fix_extent_len:
* and err is a non-zero error code.
*/
ext4_ext_dirty(handle, inode, path + path->p_depth);
- return err;
out:
- ext4_ext_show_leaf(inode, *ppath);
- return err;
+ if (err) {
+ ext4_free_ext_path(path);
+ path = ERR_PTR(err);
+ }
+ ext4_ext_show_leaf(inode, path);
+ return path;
}
/*
@@ -3319,21 +3345,18 @@ out:
* c> Splits in three extents: Somone is splitting in middle of the extent
*
*/
-static int ext4_split_extent(handle_t *handle,
- struct inode *inode,
- struct ext4_ext_path **ppath,
- struct ext4_map_blocks *map,
- int split_flag,
- int flags)
+static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_map_blocks *map,
+ int split_flag, int flags,
+ unsigned int *allocated)
{
- struct ext4_ext_path *path = *ppath;
ext4_lblk_t ee_block;
struct ext4_extent *ex;
unsigned int ee_len, depth;
- int err = 0;
int unwritten;
int split_flag1, flags1;
- int allocated = map->m_len;
depth = ext_depth(inode);
ex = path[depth].p_ext;
@@ -3349,28 +3372,27 @@ static int ext4_split_extent(handle_t *handle,
EXT4_EXT_MARK_UNWRIT2;
if (split_flag & EXT4_EXT_DATA_VALID2)
split_flag1 |= EXT4_EXT_DATA_VALID1;
- err = ext4_split_extent_at(handle, inode, ppath,
+ path = ext4_split_extent_at(handle, inode, path,
map->m_lblk + map->m_len, split_flag1, flags1);
- if (err)
- goto out;
- } else {
- allocated = ee_len - (map->m_lblk - ee_block);
- }
- /*
- * Update path is required because previous ext4_split_extent_at() may
- * result in split of original leaf or extent zeroout.
- */
- path = ext4_find_extent(inode, map->m_lblk, ppath, flags);
- if (IS_ERR(path))
- return PTR_ERR(path);
- depth = ext_depth(inode);
- ex = path[depth].p_ext;
- if (!ex) {
- EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
- (unsigned long) map->m_lblk);
- return -EFSCORRUPTED;
+ if (IS_ERR(path))
+ return path;
+ /*
+ * Update path is required because previous ext4_split_extent_at
+ * may result in split of original leaf or extent zeroout.
+ */
+ path = ext4_find_extent(inode, map->m_lblk, path, flags);
+ if (IS_ERR(path))
+ return path;
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ if (!ex) {
+ EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+ (unsigned long) map->m_lblk);
+ ext4_free_ext_path(path);
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+ unwritten = ext4_ext_is_unwritten(ex);
}
- unwritten = ext4_ext_is_unwritten(ex);
if (map->m_lblk >= ee_block) {
split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
@@ -3379,15 +3401,20 @@ static int ext4_split_extent(handle_t *handle,
split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
EXT4_EXT_MARK_UNWRIT2);
}
- err = ext4_split_extent_at(handle, inode, ppath,
+ path = ext4_split_extent_at(handle, inode, path,
map->m_lblk, split_flag1, flags);
- if (err)
- goto out;
+ if (IS_ERR(path))
+ return path;
}
- ext4_ext_show_leaf(inode, *ppath);
-out:
- return err ? err : allocated;
+ if (allocated) {
+ if (map->m_lblk + map->m_len > ee_block + ee_len)
+ *allocated = ee_len - (map->m_lblk - ee_block);
+ else
+ *allocated = map->m_len;
+ }
+ ext4_ext_show_leaf(inode, path);
+ return path;
}
/*
@@ -3410,13 +3437,11 @@ out:
* that are allocated and initialized.
* It is guaranteed to be >= map->m_len.
*/
-static int ext4_ext_convert_to_initialized(handle_t *handle,
- struct inode *inode,
- struct ext4_map_blocks *map,
- struct ext4_ext_path **ppath,
- int flags)
+static struct ext4_ext_path *
+ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, struct ext4_ext_path *path,
+ int flags, unsigned int *allocated)
{
- struct ext4_ext_path *path = *ppath;
struct ext4_sb_info *sbi;
struct ext4_extent_header *eh;
struct ext4_map_blocks split_map;
@@ -3426,7 +3451,6 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
unsigned int ee_len, depth, map_len = map->m_len;
int err = 0;
int split_flag = EXT4_EXT_DATA_VALID2;
- int allocated = 0;
unsigned int max_zeroout = 0;
ext_debug(inode, "logical block %llu, max_blocks %u\n",
@@ -3467,6 +3491,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
* - L2: we only attempt to merge with an extent stored in the
* same extent tree node.
*/
+ *allocated = 0;
if ((map->m_lblk == ee_block) &&
/* See if we can merge left */
(map_len < ee_len) && /*L1*/
@@ -3496,7 +3521,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
(prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
- goto out;
+ goto errout;
trace_ext4_ext_convert_to_initialized_fastpath(inode,
map, ex, abut_ex);
@@ -3511,7 +3536,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
/* Result: number of initialized blocks past m_lblk */
- allocated = map_len;
+ *allocated = map_len;
}
} else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
(map_len < ee_len) && /*L1*/
@@ -3542,7 +3567,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
(next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
- goto out;
+ goto errout;
trace_ext4_ext_convert_to_initialized_fastpath(inode,
map, ex, abut_ex);
@@ -3557,18 +3582,20 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
abut_ex->ee_len = cpu_to_le16(next_len + map_len);
/* Result: number of initialized blocks past m_lblk */
- allocated = map_len;
+ *allocated = map_len;
}
}
- if (allocated) {
+ if (*allocated) {
/* Mark the block containing both extents as dirty */
err = ext4_ext_dirty(handle, inode, path + depth);
/* Update path to point to the right extent */
path[depth].p_ext = abut_ex;
+ if (err)
+ goto errout;
goto out;
} else
- allocated = ee_len - (map->m_lblk - ee_block);
+ *allocated = ee_len - (map->m_lblk - ee_block);
WARN_ON(map->m_lblk < ee_block);
/*
@@ -3595,21 +3622,21 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
split_map.m_lblk = map->m_lblk;
split_map.m_len = map->m_len;
- if (max_zeroout && (allocated > split_map.m_len)) {
- if (allocated <= max_zeroout) {
+ if (max_zeroout && (*allocated > split_map.m_len)) {
+ if (*allocated <= max_zeroout) {
/* case 3 or 5 */
zero_ex1.ee_block =
cpu_to_le32(split_map.m_lblk +
split_map.m_len);
zero_ex1.ee_len =
- cpu_to_le16(allocated - split_map.m_len);
+ cpu_to_le16(*allocated - split_map.m_len);
ext4_ext_store_pblock(&zero_ex1,
ext4_ext_pblock(ex) + split_map.m_lblk +
split_map.m_len - ee_block);
err = ext4_ext_zeroout(inode, &zero_ex1);
if (err)
goto fallback;
- split_map.m_len = allocated;
+ split_map.m_len = *allocated;
}
if (split_map.m_lblk - ee_block + split_map.m_len <
max_zeroout) {
@@ -3627,22 +3654,24 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
split_map.m_len += split_map.m_lblk - ee_block;
split_map.m_lblk = ee_block;
- allocated = map->m_len;
+ *allocated = map->m_len;
}
}
fallback:
- err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
- flags);
- if (err > 0)
- err = 0;
+ path = ext4_split_extent(handle, inode, path, &split_map, split_flag,
+ flags, NULL);
+ if (IS_ERR(path))
+ return path;
out:
/* If we have gotten a failure, don't zero out status tree */
- if (!err) {
- ext4_zeroout_es(inode, &zero_ex1);
- ext4_zeroout_es(inode, &zero_ex2);
- }
- return err ? err : allocated;
+ ext4_zeroout_es(inode, &zero_ex1);
+ ext4_zeroout_es(inode, &zero_ex2);
+ return path;
+
+errout:
+ ext4_free_ext_path(path);
+ return ERR_PTR(err);
}
/*
@@ -3667,15 +3696,16 @@ out:
* being filled will be convert to initialized by the end_io callback function
* via ext4_convert_unwritten_extents().
*
- * Returns the size of unwritten extent to be written on success.
+ * The size of unwritten extent to be written is passed to the caller via the
+ * allocated pointer. Return an extent path pointer on success, or an error
+ * pointer on failure.
*/
-static int ext4_split_convert_extents(handle_t *handle,
+static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle,
struct inode *inode,
struct ext4_map_blocks *map,
- struct ext4_ext_path **ppath,
- int flags)
+ struct ext4_ext_path *path,
+ int flags, unsigned int *allocated)
{
- struct ext4_ext_path *path = *ppath;
ext4_lblk_t eof_block;
ext4_lblk_t ee_block;
struct ext4_extent *ex;
@@ -3708,15 +3738,15 @@ static int ext4_split_convert_extents(handle_t *handle,
split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
}
flags |= EXT4_GET_BLOCKS_PRE_IO;
- return ext4_split_extent(handle, inode, ppath, map, split_flag, flags);
+ return ext4_split_extent(handle, inode, path, map, split_flag, flags,
+ allocated);
}
-static int ext4_convert_unwritten_extents_endio(handle_t *handle,
- struct inode *inode,
- struct ext4_map_blocks *map,
- struct ext4_ext_path **ppath)
+static struct ext4_ext_path *
+ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map,
+ struct ext4_ext_path *path)
{
- struct ext4_ext_path *path = *ppath;
struct ext4_extent *ex;
ext4_lblk_t ee_block;
unsigned int ee_len;
@@ -3744,20 +3774,21 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
inode->i_ino, (unsigned long long)ee_block, ee_len,
(unsigned long long)map->m_lblk, map->m_len);
#endif
- err = ext4_split_convert_extents(handle, inode, map, ppath,
- EXT4_GET_BLOCKS_CONVERT);
- if (err < 0)
- return err;
- path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
+ path = ext4_split_convert_extents(handle, inode, map, path,
+ EXT4_GET_BLOCKS_CONVERT, NULL);
if (IS_ERR(path))
- return PTR_ERR(path);
+ return path;
+
+ path = ext4_find_extent(inode, map->m_lblk, path, 0);
+ if (IS_ERR(path))
+ return path;
depth = ext_depth(inode);
ex = path[depth].p_ext;
}
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
- goto out;
+ goto errout;
/* first mark the extent as initialized */
ext4_ext_mark_initialized(ex);
@@ -3768,18 +3799,23 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
/* Mark modified extent as dirty */
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
-out:
+ if (err)
+ goto errout;
+
ext4_ext_show_leaf(inode, path);
- return err;
+ return path;
+
+errout:
+ ext4_free_ext_path(path);
+ return ERR_PTR(err);
}
-static int
+static struct ext4_ext_path *
convert_initialized_extent(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
- struct ext4_ext_path **ppath,
+ struct ext4_ext_path *path,
unsigned int *allocated)
{
- struct ext4_ext_path *path = *ppath;
struct ext4_extent *ex;
ext4_lblk_t ee_block;
unsigned int ee_len;
@@ -3802,25 +3838,27 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
(unsigned long long)ee_block, ee_len);
if (ee_block != map->m_lblk || ee_len > map->m_len) {
- err = ext4_split_convert_extents(handle, inode, map, ppath,
- EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
- if (err < 0)
- return err;
- path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
+ path = ext4_split_convert_extents(handle, inode, map, path,
+ EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, NULL);
if (IS_ERR(path))
- return PTR_ERR(path);
+ return path;
+
+ path = ext4_find_extent(inode, map->m_lblk, path, 0);
+ if (IS_ERR(path))
+ return path;
depth = ext_depth(inode);
ex = path[depth].p_ext;
if (!ex) {
EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
(unsigned long) map->m_lblk);
- return -EFSCORRUPTED;
+ err = -EFSCORRUPTED;
+ goto errout;
}
}
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
- return err;
+ goto errout;
/* first mark the extent as unwritten */
ext4_ext_mark_unwritten(ex);
@@ -3832,7 +3870,7 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
/* Mark modified extent as dirty */
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
if (err)
- return err;
+ goto errout;
ext4_ext_show_leaf(inode, path);
ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -3841,22 +3879,25 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
if (*allocated > map->m_len)
*allocated = map->m_len;
map->m_len = *allocated;
- return 0;
+ return path;
+
+errout:
+ ext4_free_ext_path(path);
+ return ERR_PTR(err);
}
-static int
+static struct ext4_ext_path *
ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
- struct ext4_ext_path **ppath, int flags,
- unsigned int allocated, ext4_fsblk_t newblock)
+ struct ext4_ext_path *path, int flags,
+ unsigned int *allocated, ext4_fsblk_t newblock)
{
- int ret = 0;
int err = 0;
ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n",
(unsigned long long)map->m_lblk, map->m_len, flags,
- allocated);
- ext4_ext_show_leaf(inode, *ppath);
+ *allocated);
+ ext4_ext_show_leaf(inode, path);
/*
* When writing into unwritten space, we should not fail to
@@ -3865,36 +3906,34 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
- allocated, newblock);
+ *allocated, newblock);
/* get_block() before submitting IO, split the extent */
if (flags & EXT4_GET_BLOCKS_PRE_IO) {
- ret = ext4_split_convert_extents(handle, inode, map, ppath,
- flags | EXT4_GET_BLOCKS_CONVERT);
- if (ret < 0) {
- err = ret;
- goto out2;
- }
+ path = ext4_split_convert_extents(handle, inode, map, path,
+ flags | EXT4_GET_BLOCKS_CONVERT, allocated);
+ if (IS_ERR(path))
+ return path;
/*
- * shouldn't get a 0 return when splitting an extent unless
+ * shouldn't get a 0 allocated when splitting an extent unless
* m_len is 0 (bug) or extent has been corrupted
*/
- if (unlikely(ret == 0)) {
+ if (unlikely(*allocated == 0)) {
EXT4_ERROR_INODE(inode,
- "unexpected ret == 0, m_len = %u",
+ "unexpected allocated == 0, m_len = %u",
map->m_len);
err = -EFSCORRUPTED;
- goto out2;
+ goto errout;
}
map->m_flags |= EXT4_MAP_UNWRITTEN;
goto out;
}
/* IO end_io complete, convert the filled extent to written */
if (flags & EXT4_GET_BLOCKS_CONVERT) {
- err = ext4_convert_unwritten_extents_endio(handle, inode, map,
- ppath);
- if (err < 0)
- goto out2;
+ path = ext4_convert_unwritten_extents_endio(handle, inode,
+ map, path);
+ if (IS_ERR(path))
+ return path;
ext4_update_inode_fsync_trans(handle, inode, 1);
goto map_out;
}
@@ -3926,36 +3965,37 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
* For buffered writes, at writepage time, etc. Convert a
* discovered unwritten extent to written.
*/
- ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags);
- if (ret < 0) {
- err = ret;
- goto out2;
- }
+ path = ext4_ext_convert_to_initialized(handle, inode, map, path,
+ flags, allocated);
+ if (IS_ERR(path))
+ return path;
ext4_update_inode_fsync_trans(handle, inode, 1);
/*
- * shouldn't get a 0 return when converting an unwritten extent
+ * shouldn't get a 0 allocated when converting an unwritten extent
* unless m_len is 0 (bug) or extent has been corrupted
*/
- if (unlikely(ret == 0)) {
- EXT4_ERROR_INODE(inode, "unexpected ret == 0, m_len = %u",
+ if (unlikely(*allocated == 0)) {
+ EXT4_ERROR_INODE(inode, "unexpected allocated == 0, m_len = %u",
map->m_len);
err = -EFSCORRUPTED;
- goto out2;
+ goto errout;
}
out:
- allocated = ret;
map->m_flags |= EXT4_MAP_NEW;
map_out:
map->m_flags |= EXT4_MAP_MAPPED;
out1:
map->m_pblk = newblock;
- if (allocated > map->m_len)
- allocated = map->m_len;
- map->m_len = allocated;
- ext4_ext_show_leaf(inode, *ppath);
-out2:
- return err ? err : allocated;
+ if (*allocated > map->m_len)
+ *allocated = map->m_len;
+ map->m_len = *allocated;
+ ext4_ext_show_leaf(inode, path);
+ return path;
+
+errout:
+ ext4_free_ext_path(path);
+ return ERR_PTR(err);
}
/*
@@ -4098,8 +4138,11 @@ again:
/*
* The delalloc extent containing lblk, it must have been
* added after ext4_map_blocks() checked the extent status
- * tree, adjust the length to the delalloc extent's after
- * lblk.
+ * tree so we are not holding i_rwsem and delalloc info is
+ * only stabilized by i_data_sem we are going to release
+ * soon. Don't modify the extent status tree and report
+ * extent as a hole, just adjust the length to the delalloc
+ * extent's after lblk.
*/
len = es.es_lblk + es.es_len - lblk;
return len;
@@ -4115,7 +4158,8 @@ again:
insert_hole:
/* Put just found gap into cache to speed up subsequent requests */
ext_debug(inode, " -> %u:%u\n", hole_start, len);
- ext4_es_insert_extent(inode, hole_start, len, ~0, EXTENT_STATUS_HOLE);
+ ext4_es_insert_extent(inode, hole_start, len, ~0,
+ EXTENT_STATUS_HOLE, 0);
/* Update hole_len to reflect hole size after lblk */
if (hole_start != lblk)
@@ -4130,10 +4174,10 @@ insert_hole:
*
* Need to be called with
* down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
- * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
+ * (ie, flags is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
*
* return > 0, number of blocks already mapped/allocated
- * if create == 0 and these are pre-allocated blocks
+ * if flags doesn't contain EXT4_GET_BLOCKS_CREATE and these are pre-allocated blocks
* buffer head is unmapped
* otherwise blocks are mapped
*
@@ -4149,7 +4193,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_extent newex, *ex, ex2;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_fsblk_t newblock = 0, pblk;
- int err = 0, depth, ret;
+ int err = 0, depth;
unsigned int allocated = 0, offset = 0;
unsigned int allocated_clusters = 0;
struct ext4_allocation_request ar;
@@ -4162,7 +4206,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
if (IS_ERR(path)) {
err = PTR_ERR(path);
- path = NULL;
goto out;
}
@@ -4211,8 +4254,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
*/
if ((!ext4_ext_is_unwritten(ex)) &&
(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
- err = convert_initialized_extent(handle,
- inode, map, &path, &allocated);
+ path = convert_initialized_extent(handle,
+ inode, map, path, &allocated);
+ if (IS_ERR(path))
+ err = PTR_ERR(path);
goto out;
} else if (!ext4_ext_is_unwritten(ex)) {
map->m_flags |= EXT4_MAP_MAPPED;
@@ -4224,20 +4269,18 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
goto out;
}
- ret = ext4_ext_handle_unwritten_extents(
- handle, inode, map, &path, flags,
- allocated, newblock);
- if (ret < 0)
- err = ret;
- else
- allocated = ret;
+ path = ext4_ext_handle_unwritten_extents(
+ handle, inode, map, path, flags,
+ &allocated, newblock);
+ if (IS_ERR(path))
+ err = PTR_ERR(path);
goto out;
}
}
/*
* requested block isn't allocated yet;
- * we couldn't try to create block if create flag is zero
+ * we couldn't try to create block if flags doesn't contain EXT4_GET_BLOCKS_CREATE
*/
if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
ext4_lblk_t len;
@@ -4282,6 +4325,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) {
ar.len = allocated = map->m_len;
newblock = map->m_pblk;
+ err = 0;
goto got_allocated_blocks;
}
@@ -4354,8 +4398,9 @@ got_allocated_blocks:
map->m_flags |= EXT4_MAP_UNWRITTEN;
}
- err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags);
- if (err) {
+ path = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
if (allocated_clusters) {
int fb_flags = 0;
@@ -4364,7 +4409,7 @@ got_allocated_blocks:
* not a good idea to call discard here directly,
* but otherwise we'd need to call it every free().
*/
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE;
ext4_free_blocks(handle, inode, NULL, newblock,
@@ -4375,43 +4420,6 @@ got_allocated_blocks:
}
/*
- * Reduce the reserved cluster count to reflect successful deferred
- * allocation of delayed allocated clusters or direct allocation of
- * clusters discovered to be delayed allocated. Once allocated, a
- * cluster is not included in the reserved count.
- */
- if (test_opt(inode->i_sb, DELALLOC) && allocated_clusters) {
- if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
- /*
- * When allocating delayed allocated clusters, simply
- * reduce the reserved cluster count and claim quota
- */
- ext4_da_update_reserve_space(inode, allocated_clusters,
- 1);
- } else {
- ext4_lblk_t lblk, len;
- unsigned int n;
-
- /*
- * When allocating non-delayed allocated clusters
- * (from fallocate, filemap, DIO, or clusters
- * allocated when delalloc has been disabled by
- * ext4_nonda_switch), reduce the reserved cluster
- * count by the number of allocated clusters that
- * have previously been delayed allocated. Quota
- * has been claimed by ext4_mb_new_blocks() above,
- * so release the quota reservations made for any
- * previously delayed allocated clusters.
- */
- lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk);
- len = allocated_clusters << sbi->s_cluster_bits;
- n = ext4_es_delayed_clu(inode, lblk, len);
- if (n > 0)
- ext4_da_update_reserve_space(inode, (int) n, 0);
- }
- }
-
- /*
* Cache the extent and update transaction to commit on fdatasync only
* when it is _not_ an unwritten extent.
*/
@@ -4475,7 +4483,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
int depth = 0;
struct ext4_map_blocks map;
unsigned int credits;
- loff_t epos;
+ loff_t epos, old_size = i_size_read(inode);
BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
map.m_lblk = offset;
@@ -4532,7 +4540,13 @@ retry:
if (epos > new_size)
epos = new_size;
if (ext4_update_inode_size(inode, epos) & 0x1)
- inode->i_mtime = inode_get_ctime(inode);
+ inode_set_mtime_to_ts(inode,
+ inode_get_ctime(inode));
+ if (epos > old_size) {
+ pagecache_isize_extended(inode, old_size, epos);
+ ext4_zero_partial_blocks(handle, inode,
+ old_size, epos - old_size);
+ }
}
ret2 = ext4_mark_inode_dirty(handle, inode);
ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -4555,131 +4569,65 @@ static long ext4_zero_range(struct file *file, loff_t offset,
loff_t len, int mode)
{
struct inode *inode = file_inode(file);
- struct address_space *mapping = file->f_mapping;
handle_t *handle = NULL;
- unsigned int max_blocks;
loff_t new_size = 0;
- int ret = 0;
- int flags;
- int credits;
- int partial_begin, partial_end;
- loff_t start, end;
- ext4_lblk_t lblk;
+ loff_t end = offset + len;
+ ext4_lblk_t start_lblk, end_lblk;
+ unsigned int blocksize = i_blocksize(inode);
unsigned int blkbits = inode->i_blkbits;
+ int ret, flags, credits;
trace_ext4_zero_range(inode, offset, len, mode);
+ WARN_ON_ONCE(!inode_is_locked(inode));
- /*
- * Round up offset. This is not fallocate, we need to zero out
- * blocks, so convert interior block aligned part of the range to
- * unwritten and possibly manually zero out unaligned parts of the
- * range. Here, start and partial_begin are inclusive, end and
- * partial_end are exclusive.
- */
- start = round_up(offset, 1 << blkbits);
- end = round_down((offset + len), 1 << blkbits);
-
- if (start < offset || end > offset + len)
- return -EINVAL;
- partial_begin = offset & ((1 << blkbits) - 1);
- partial_end = (offset + len) & ((1 << blkbits) - 1);
-
- lblk = start >> blkbits;
- max_blocks = (end >> blkbits);
- if (max_blocks < lblk)
- max_blocks = 0;
- else
- max_blocks -= lblk;
-
- inode_lock(inode);
-
- /*
- * Indirect files do not support unwritten extents
- */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
- ret = -EOPNOTSUPP;
- goto out_mutex;
- }
+ /* Indirect files do not support unwritten extents */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ return -EOPNOTSUPP;
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- (offset + len > inode->i_size ||
- offset + len > EXT4_I(inode)->i_disksize)) {
- new_size = offset + len;
+ (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) {
+ new_size = end;
ret = inode_newsize_ok(inode, new_size);
if (ret)
- goto out_mutex;
+ return ret;
}
flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
-
- /* Wait all existing dio workers, newcomers will block on i_rwsem */
- inode_dio_wait(inode);
-
- ret = file_modified(file);
- if (ret)
- goto out_mutex;
-
/* Preallocate the range including the unaligned edges */
- if (partial_begin || partial_end) {
- ret = ext4_alloc_file_blocks(file,
- round_down(offset, 1 << blkbits) >> blkbits,
- (round_up((offset + len), 1 << blkbits) -
- round_down(offset, 1 << blkbits)) >> blkbits,
- new_size, flags);
- if (ret)
- goto out_mutex;
+ if (!IS_ALIGNED(offset | end, blocksize)) {
+ ext4_lblk_t alloc_lblk = offset >> blkbits;
+ ext4_lblk_t len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits);
+ ret = ext4_alloc_file_blocks(file, alloc_lblk, len_lblk,
+ new_size, flags);
+ if (ret)
+ return ret;
}
- /* Zero range excluding the unaligned edges */
- if (max_blocks > 0) {
- flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
- EXT4_EX_NOCACHE);
-
- /*
- * Prevent page faults from reinstantiating pages we have
- * released from page cache.
- */
- filemap_invalidate_lock(mapping);
-
- ret = ext4_break_layouts(inode);
- if (ret) {
- filemap_invalidate_unlock(mapping);
- goto out_mutex;
- }
-
- ret = ext4_update_disksize_before_punch(inode, offset, len);
- if (ret) {
- filemap_invalidate_unlock(mapping);
- goto out_mutex;
- }
-
- /*
- * For journalled data we need to write (and checkpoint) pages
- * before discarding page cache to avoid inconsitent data on
- * disk in case of crash before zeroing trans is committed.
- */
- if (ext4_should_journal_data(inode)) {
- ret = filemap_write_and_wait_range(mapping, start,
- end - 1);
- if (ret) {
- filemap_invalidate_unlock(mapping);
- goto out_mutex;
- }
- }
+ ret = ext4_update_disksize_before_punch(inode, offset, len);
+ if (ret)
+ return ret;
- /* Now release the pages and zero block aligned part of pages */
- truncate_pagecache_range(inode, start, end - 1);
- inode->i_mtime = inode_set_ctime_current(inode);
+ /* Now release the pages and zero block aligned part of pages */
+ ret = ext4_truncate_page_cache_block_range(inode, offset, end);
+ if (ret)
+ return ret;
- ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
- flags);
- filemap_invalidate_unlock(mapping);
+ /* Zero range excluding the unaligned edges */
+ start_lblk = EXT4_B_TO_LBLK(inode, offset);
+ end_lblk = end >> blkbits;
+ if (end_lblk > start_lblk) {
+ ext4_lblk_t zero_blks = end_lblk - start_lblk;
+
+ flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE);
+ ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks,
+ new_size, flags);
if (ret)
- goto out_mutex;
+ return ret;
}
- if (!partial_begin && !partial_end)
- goto out_mutex;
+ /* Finish zeroing out if it doesn't contain partial block */
+ if (IS_ALIGNED(offset | end, blocksize))
+ return ret;
/*
* In worst case we have to writeout two nonadjacent unwritten
@@ -4692,27 +4640,69 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_std_error(inode->i_sb, ret);
- goto out_mutex;
+ return ret;
}
- inode->i_mtime = inode_set_ctime_current(inode);
+ /* Zero out partial block at the edges of the range */
+ ret = ext4_zero_partial_blocks(handle, inode, offset, len);
+ if (ret)
+ goto out_handle;
+
if (new_size)
ext4_update_inode_size(inode, new_size);
ret = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret))
goto out_handle;
- /* Zero out partial block at the edges of the range */
- ret = ext4_zero_partial_blocks(handle, inode, offset, len);
- if (ret >= 0)
- ext4_update_inode_fsync_trans(handle, inode, 1);
+ ext4_update_inode_fsync_trans(handle, inode, 1);
if (file->f_flags & O_SYNC)
ext4_handle_sync(handle);
out_handle:
ext4_journal_stop(handle);
-out_mutex:
- inode_unlock(inode);
+ return ret;
+}
+
+static long ext4_do_fallocate(struct file *file, loff_t offset,
+ loff_t len, int mode)
+{
+ struct inode *inode = file_inode(file);
+ loff_t end = offset + len;
+ loff_t new_size = 0;
+ ext4_lblk_t start_lblk, len_lblk;
+ int ret;
+
+ trace_ext4_fallocate_enter(inode, offset, len, mode);
+ WARN_ON_ONCE(!inode_is_locked(inode));
+
+ start_lblk = offset >> inode->i_blkbits;
+ len_lblk = EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits);
+
+ /* We only support preallocation for extent-based files only. */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) {
+ new_size = end;
+ ret = inode_newsize_ok(inode, new_size);
+ if (ret)
+ goto out;
+ }
+
+ ret = ext4_alloc_file_blocks(file, start_lblk, len_lblk, new_size,
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
+ if (ret)
+ goto out;
+
+ if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
+ ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
+ EXT4_I(inode)->i_sync_tid);
+ }
+out:
+ trace_ext4_fallocate_exit(inode, offset, len_lblk, ret);
return ret;
}
@@ -4726,12 +4716,8 @@ out_mutex:
long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
- loff_t new_size = 0;
- unsigned int max_blocks;
- int ret = 0;
- int flags;
- ext4_lblk_t lblk;
- unsigned int blkbits = inode->i_blkbits;
+ struct address_space *mapping = file->f_mapping;
+ int ret;
/*
* Encrypted inodes can't handle collapse range or insert
@@ -4751,73 +4737,47 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
inode_lock(inode);
ret = ext4_convert_inline_data(inode);
- inode_unlock(inode);
if (ret)
- goto exit;
-
- if (mode & FALLOC_FL_PUNCH_HOLE) {
- ret = ext4_punch_hole(file, offset, len);
- goto exit;
- }
+ goto out_inode_lock;
- if (mode & FALLOC_FL_COLLAPSE_RANGE) {
- ret = ext4_collapse_range(file, offset, len);
- goto exit;
- }
+ /* Wait all existing dio workers, newcomers will block on i_rwsem */
+ inode_dio_wait(inode);
- if (mode & FALLOC_FL_INSERT_RANGE) {
- ret = ext4_insert_range(file, offset, len);
- goto exit;
- }
+ ret = file_modified(file);
+ if (ret)
+ goto out_inode_lock;
- if (mode & FALLOC_FL_ZERO_RANGE) {
- ret = ext4_zero_range(file, offset, len, mode);
- goto exit;
+ if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ALLOCATE_RANGE) {
+ ret = ext4_do_fallocate(file, offset, len, mode);
+ goto out_inode_lock;
}
- trace_ext4_fallocate_enter(inode, offset, len, mode);
- lblk = offset >> blkbits;
-
- max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
- flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
-
- inode_lock(inode);
/*
- * We only support preallocation for extent-based files only
+ * Follow-up operations will drop page cache, hold invalidate lock
+ * to prevent page faults from reinstantiating pages we have
+ * released from page cache.
*/
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
- ret = -EOPNOTSUPP;
- goto out;
- }
-
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- (offset + len > inode->i_size ||
- offset + len > EXT4_I(inode)->i_disksize)) {
- new_size = offset + len;
- ret = inode_newsize_ok(inode, new_size);
- if (ret)
- goto out;
- }
-
- /* Wait all existing dio workers, newcomers will block on i_rwsem */
- inode_dio_wait(inode);
+ filemap_invalidate_lock(mapping);
- ret = file_modified(file);
+ ret = ext4_break_layouts(inode);
if (ret)
- goto out;
+ goto out_invalidate_lock;
- ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
- if (ret)
- goto out;
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ ret = ext4_punch_hole(file, offset, len);
+ else if (mode & FALLOC_FL_COLLAPSE_RANGE)
+ ret = ext4_collapse_range(file, offset, len);
+ else if (mode & FALLOC_FL_INSERT_RANGE)
+ ret = ext4_insert_range(file, offset, len);
+ else if (mode & FALLOC_FL_ZERO_RANGE)
+ ret = ext4_zero_range(file, offset, len, mode);
+ else
+ ret = -EOPNOTSUPP;
- if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
- ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
- EXT4_I(inode)->i_sync_tid);
- }
-out:
+out_invalidate_lock:
+ filemap_invalidate_unlock(mapping);
+out_inode_lock:
inode_unlock(inode);
- trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
-exit:
return ret;
}
@@ -4972,12 +4932,7 @@ static const struct iomap_ops ext4_iomap_xattr_ops = {
static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len)
{
- u64 maxbytes;
-
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- maxbytes = inode->i_sb->s_maxbytes;
- else
- maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+ u64 maxbytes = ext4_get_maxbytes(inode);
if (*len == 0)
return -EINVAL;
@@ -5201,7 +5156,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
* won't be shifted beyond EXT_MAX_BLOCKS.
*/
if (SHIFT == SHIFT_LEFT) {
- path = ext4_find_extent(inode, start - 1, &path,
+ path = ext4_find_extent(inode, start - 1, path,
EXT4_EX_NOCACHE);
if (IS_ERR(path))
return PTR_ERR(path);
@@ -5250,7 +5205,7 @@ again:
* becomes NULL to indicate the end of the loop.
*/
while (iterator && start <= stop) {
- path = ext4_find_extent(inode, *iterator, &path,
+ path = ext4_find_extent(inode, *iterator, path,
EXT4_EX_NOCACHE);
if (IS_ERR(path))
return PTR_ERR(path);
@@ -5319,109 +5274,72 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct address_space *mapping = inode->i_mapping;
- ext4_lblk_t punch_start, punch_stop;
+ loff_t end = offset + len;
+ ext4_lblk_t start_lblk, end_lblk;
handle_t *handle;
unsigned int credits;
- loff_t new_size, ioffset;
+ loff_t start, new_size;
int ret;
- /*
- * We need to test this early because xfstests assumes that a
- * collapse range of (0, 1) will return EOPNOTSUPP if the file
- * system does not support collapse range.
- */
+ trace_ext4_collapse_range(inode, offset, len);
+ WARN_ON_ONCE(!inode_is_locked(inode));
+
+ /* Currently just for extent based files */
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return -EOPNOTSUPP;
-
/* Collapse range works only on fs cluster size aligned regions. */
if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
return -EINVAL;
-
- trace_ext4_collapse_range(inode, offset, len);
-
- punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
- punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
-
- inode_lock(inode);
/*
* There is no need to overlap collapse range with EOF, in which case
* it is effectively a truncate operation
*/
- if (offset + len >= inode->i_size) {
- ret = -EINVAL;
- goto out_mutex;
- }
-
- /* Currently just for extent based files */
- if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- ret = -EOPNOTSUPP;
- goto out_mutex;
- }
-
- /* Wait for existing dio to complete */
- inode_dio_wait(inode);
-
- ret = file_modified(file);
- if (ret)
- goto out_mutex;
-
- /*
- * Prevent page faults from reinstantiating pages we have released from
- * page cache.
- */
- filemap_invalidate_lock(mapping);
-
- ret = ext4_break_layouts(inode);
- if (ret)
- goto out_mmap;
+ if (end >= inode->i_size)
+ return -EINVAL;
/*
+ * Write tail of the last page before removed range and data that
+ * will be shifted since they will get removed from the page cache
+ * below. We are also protected from pages becoming dirty by
+ * i_rwsem and invalidate_lock.
* Need to round down offset to be aligned with page size boundary
* for page size > block size.
*/
- ioffset = round_down(offset, PAGE_SIZE);
- /*
- * Write tail of the last page before removed range since it will get
- * removed from the page cache below.
- */
- ret = filemap_write_and_wait_range(mapping, ioffset, offset);
- if (ret)
- goto out_mmap;
- /*
- * Write data that will be shifted to preserve them when discarding
- * page cache below. We are also protected from pages becoming dirty
- * by i_rwsem and invalidate_lock.
- */
- ret = filemap_write_and_wait_range(mapping, offset + len,
- LLONG_MAX);
+ start = round_down(offset, PAGE_SIZE);
+ ret = filemap_write_and_wait_range(mapping, start, offset);
+ if (!ret)
+ ret = filemap_write_and_wait_range(mapping, end, LLONG_MAX);
if (ret)
- goto out_mmap;
- truncate_pagecache(inode, ioffset);
+ return ret;
+
+ truncate_pagecache(inode, start);
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_mmap;
- }
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
+ start_lblk = offset >> inode->i_blkbits;
+ end_lblk = (offset + len) >> inode->i_blkbits;
+
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode, 0);
- ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start);
+ ext4_discard_preallocations(inode);
+ ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk);
- ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
+ ret = ext4_ext_remove_space(inode, start_lblk, end_lblk - 1);
if (ret) {
up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
+ goto out_handle;
}
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
- ret = ext4_ext_shift_extents(inode, handle, punch_stop,
- punch_stop - punch_start, SHIFT_LEFT);
+ ret = ext4_ext_shift_extents(inode, handle, end_lblk,
+ end_lblk - start_lblk, SHIFT_LEFT);
if (ret) {
up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
+ goto out_handle;
}
new_size = inode->i_size - len;
@@ -5429,18 +5347,16 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
EXT4_I(inode)->i_disksize = new_size;
up_write(&EXT4_I(inode)->i_data_sem);
- if (IS_SYNC(inode))
- ext4_handle_sync(handle);
- inode->i_mtime = inode_set_ctime_current(inode);
ret = ext4_mark_inode_dirty(handle, inode);
+ if (ret)
+ goto out_handle;
+
ext4_update_inode_fsync_trans(handle, inode, 1);
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
-out_stop:
+out_handle:
ext4_journal_stop(handle);
-out_mmap:
- filemap_invalidate_unlock(mapping);
-out_mutex:
- inode_unlock(inode);
return ret;
}
@@ -5460,100 +5376,63 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
handle_t *handle;
struct ext4_ext_path *path;
struct ext4_extent *extent;
- ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
+ ext4_lblk_t start_lblk, len_lblk, ee_start_lblk = 0;
unsigned int credits, ee_len;
- int ret = 0, depth, split_flag = 0;
- loff_t ioffset;
+ int ret, depth, split_flag = 0;
+ loff_t start;
- /*
- * We need to test this early because xfstests assumes that an
- * insert range of (0, 1) will return EOPNOTSUPP if the file
- * system does not support insert range.
- */
+ trace_ext4_insert_range(inode, offset, len);
+ WARN_ON_ONCE(!inode_is_locked(inode));
+
+ /* Currently just for extent based files */
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return -EOPNOTSUPP;
-
/* Insert range works only on fs cluster size aligned regions. */
if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
return -EINVAL;
-
- trace_ext4_insert_range(inode, offset, len);
-
- offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
- len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);
-
- inode_lock(inode);
- /* Currently just for extent based files */
- if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- ret = -EOPNOTSUPP;
- goto out_mutex;
- }
-
- /* Check whether the maximum file size would be exceeded */
- if (len > inode->i_sb->s_maxbytes - inode->i_size) {
- ret = -EFBIG;
- goto out_mutex;
- }
-
/* Offset must be less than i_size */
- if (offset >= inode->i_size) {
- ret = -EINVAL;
- goto out_mutex;
- }
-
- /* Wait for existing dio to complete */
- inode_dio_wait(inode);
-
- ret = file_modified(file);
- if (ret)
- goto out_mutex;
+ if (offset >= inode->i_size)
+ return -EINVAL;
+ /* Check whether the maximum file size would be exceeded */
+ if (len > inode->i_sb->s_maxbytes - inode->i_size)
+ return -EFBIG;
/*
- * Prevent page faults from reinstantiating pages we have released from
- * page cache.
+ * Write out all dirty pages. Need to round down to align start offset
+ * to page size boundary for page size > block size.
*/
- filemap_invalidate_lock(mapping);
-
- ret = ext4_break_layouts(inode);
+ start = round_down(offset, PAGE_SIZE);
+ ret = filemap_write_and_wait_range(mapping, start, LLONG_MAX);
if (ret)
- goto out_mmap;
+ return ret;
- /*
- * Need to round down to align start offset to page size boundary
- * for page size > block size.
- */
- ioffset = round_down(offset, PAGE_SIZE);
- /* Write out all dirty pages */
- ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
- LLONG_MAX);
- if (ret)
- goto out_mmap;
- truncate_pagecache(inode, ioffset);
+ truncate_pagecache(inode, start);
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_mmap;
- }
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
/* Expand file to avoid data loss if there is error while shifting */
inode->i_size += len;
EXT4_I(inode)->i_disksize += len;
- inode->i_mtime = inode_set_ctime_current(inode);
ret = ext4_mark_inode_dirty(handle, inode);
if (ret)
- goto out_stop;
+ goto out_handle;
+
+ start_lblk = offset >> inode->i_blkbits;
+ len_lblk = len >> inode->i_blkbits;
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
- path = ext4_find_extent(inode, offset_lblk, NULL, 0);
+ path = ext4_find_extent(inode, start_lblk, NULL, 0);
if (IS_ERR(path)) {
up_write(&EXT4_I(inode)->i_data_sem);
ret = PTR_ERR(path);
- goto out_stop;
+ goto out_handle;
}
depth = ext_depth(inode);
@@ -5563,51 +5442,47 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
ee_len = ext4_ext_get_actual_len(extent);
/*
- * If offset_lblk is not the starting block of extent, split
- * the extent @offset_lblk
+ * If start_lblk is not the starting block of extent, split
+ * the extent @start_lblk
*/
- if ((offset_lblk > ee_start_lblk) &&
- (offset_lblk < (ee_start_lblk + ee_len))) {
+ if ((start_lblk > ee_start_lblk) &&
+ (start_lblk < (ee_start_lblk + ee_len))) {
if (ext4_ext_is_unwritten(extent))
split_flag = EXT4_EXT_MARK_UNWRIT1 |
EXT4_EXT_MARK_UNWRIT2;
- ret = ext4_split_extent_at(handle, inode, &path,
- offset_lblk, split_flag,
+ path = ext4_split_extent_at(handle, inode, path,
+ start_lblk, split_flag,
EXT4_EX_NOCACHE |
EXT4_GET_BLOCKS_PRE_IO |
EXT4_GET_BLOCKS_METADATA_NOFAIL);
}
- ext4_free_ext_path(path);
- if (ret < 0) {
+ if (IS_ERR(path)) {
up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
+ ret = PTR_ERR(path);
+ goto out_handle;
}
- } else {
- ext4_free_ext_path(path);
}
- ext4_es_remove_extent(inode, offset_lblk, EXT_MAX_BLOCKS - offset_lblk);
+ ext4_free_ext_path(path);
+ ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk);
/*
- * if offset_lblk lies in a hole which is at start of file, use
+ * if start_lblk lies in a hole which is at start of file, use
* ee_start_lblk to shift extents
*/
ret = ext4_ext_shift_extents(inode, handle,
- max(ee_start_lblk, offset_lblk), len_lblk, SHIFT_RIGHT);
-
+ max(ee_start_lblk, start_lblk), len_lblk, SHIFT_RIGHT);
up_write(&EXT4_I(inode)->i_data_sem);
+ if (ret)
+ goto out_handle;
+
+ ext4_update_inode_fsync_trans(handle, inode, 1);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- if (ret >= 0)
- ext4_update_inode_fsync_trans(handle, inode, 1);
-out_stop:
+out_handle:
ext4_journal_stop(handle);
-out_mmap:
- filemap_invalidate_unlock(mapping);
-out_mutex:
- inode_unlock(inode);
return ret;
}
@@ -5654,25 +5529,21 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
int e1_len, e2_len, len;
int split = 0;
- path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
+ path1 = ext4_find_extent(inode1, lblk1, path1, EXT4_EX_NOCACHE);
if (IS_ERR(path1)) {
*erp = PTR_ERR(path1);
- path1 = NULL;
- finish:
- count = 0;
- goto repeat;
+ goto errout;
}
- path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
+ path2 = ext4_find_extent(inode2, lblk2, path2, EXT4_EX_NOCACHE);
if (IS_ERR(path2)) {
*erp = PTR_ERR(path2);
- path2 = NULL;
- goto finish;
+ goto errout;
}
ex1 = path1[path1->p_depth].p_ext;
ex2 = path2[path2->p_depth].p_ext;
/* Do we have something to swap ? */
if (unlikely(!ex2 || !ex1))
- goto finish;
+ goto errout;
e1_blk = le32_to_cpu(ex1->ee_block);
e2_blk = le32_to_cpu(ex2->ee_block);
@@ -5694,7 +5565,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
next2 = e2_blk;
/* Do we have something to swap */
if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
- goto finish;
+ goto errout;
/* Move to the rightest boundary */
len = next1 - lblk1;
if (len < next2 - lblk2)
@@ -5704,28 +5575,32 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
lblk1 += len;
lblk2 += len;
count -= len;
- goto repeat;
+ continue;
}
/* Prepare left boundary */
if (e1_blk < lblk1) {
split = 1;
- *erp = ext4_force_split_extent_at(handle, inode1,
- &path1, lblk1, 0);
- if (unlikely(*erp))
- goto finish;
+ path1 = ext4_force_split_extent_at(handle, inode1,
+ path1, lblk1, 0);
+ if (IS_ERR(path1)) {
+ *erp = PTR_ERR(path1);
+ goto errout;
+ }
}
if (e2_blk < lblk2) {
split = 1;
- *erp = ext4_force_split_extent_at(handle, inode2,
- &path2, lblk2, 0);
- if (unlikely(*erp))
- goto finish;
+ path2 = ext4_force_split_extent_at(handle, inode2,
+ path2, lblk2, 0);
+ if (IS_ERR(path2)) {
+ *erp = PTR_ERR(path2);
+ goto errout;
+ }
}
/* ext4_split_extent_at() may result in leaf extent split,
* path must to be revalidated. */
if (split)
- goto repeat;
+ continue;
/* Prepare right boundary */
len = count;
@@ -5736,30 +5611,34 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
if (len != e1_len) {
split = 1;
- *erp = ext4_force_split_extent_at(handle, inode1,
- &path1, lblk1 + len, 0);
- if (unlikely(*erp))
- goto finish;
+ path1 = ext4_force_split_extent_at(handle, inode1,
+ path1, lblk1 + len, 0);
+ if (IS_ERR(path1)) {
+ *erp = PTR_ERR(path1);
+ goto errout;
+ }
}
if (len != e2_len) {
split = 1;
- *erp = ext4_force_split_extent_at(handle, inode2,
- &path2, lblk2 + len, 0);
- if (*erp)
- goto finish;
+ path2 = ext4_force_split_extent_at(handle, inode2,
+ path2, lblk2 + len, 0);
+ if (IS_ERR(path2)) {
+ *erp = PTR_ERR(path2);
+ goto errout;
+ }
}
/* ext4_split_extent_at() may result in leaf extent split,
* path must to be revalidated. */
if (split)
- goto repeat;
+ continue;
BUG_ON(e2_len != e1_len);
*erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
if (unlikely(*erp))
- goto finish;
+ goto errout;
*erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
if (unlikely(*erp))
- goto finish;
+ goto errout;
/* Both extents are fully inside boundaries. Swap it now */
tmp_ex = *ex1;
@@ -5777,7 +5656,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
*erp = ext4_ext_dirty(handle, inode2, path2 +
path2->p_depth);
if (unlikely(*erp))
- goto finish;
+ goto errout;
*erp = ext4_ext_dirty(handle, inode1, path1 +
path1->p_depth);
/*
@@ -5787,17 +5666,17 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
* aborted anyway.
*/
if (unlikely(*erp))
- goto finish;
+ goto errout;
+
lblk1 += len;
lblk2 += len;
replaced_count += len;
count -= len;
-
- repeat:
- ext4_free_ext_path(path1);
- ext4_free_ext_path(path2);
- path1 = path2 = NULL;
}
+
+errout:
+ ext4_free_ext_path(path1);
+ ext4_free_ext_path(path2);
return replaced_count;
}
@@ -5832,11 +5711,8 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
/* search for the extent closest to the first block in the cluster */
path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
- if (IS_ERR(path)) {
- err = PTR_ERR(path);
- path = NULL;
- goto out;
- }
+ if (IS_ERR(path))
+ return PTR_ERR(path);
depth = ext_depth(inode);
@@ -5915,26 +5791,31 @@ int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
ext4_ext_get_actual_len(ex) != len) {
/* We need to split this extent to match our extent first */
down_write(&EXT4_I(inode)->i_data_sem);
- ret = ext4_force_split_extent_at(NULL, inode, &path, start, 1);
+ path = ext4_force_split_extent_at(NULL, inode, path, start, 1);
up_write(&EXT4_I(inode)->i_data_sem);
- if (ret)
+ if (IS_ERR(path)) {
+ ret = PTR_ERR(path);
goto out;
+ }
- path = ext4_find_extent(inode, start, &path, 0);
+ path = ext4_find_extent(inode, start, path, 0);
if (IS_ERR(path))
return PTR_ERR(path);
+
ex = path[path->p_depth].p_ext;
WARN_ON(le32_to_cpu(ex->ee_block) != start);
if (ext4_ext_get_actual_len(ex) != len) {
down_write(&EXT4_I(inode)->i_data_sem);
- ret = ext4_force_split_extent_at(NULL, inode, &path,
- start + len, 1);
+ path = ext4_force_split_extent_at(NULL, inode, path,
+ start + len, 1);
up_write(&EXT4_I(inode)->i_data_sem);
- if (ret)
+ if (IS_ERR(path)) {
+ ret = PTR_ERR(path);
goto out;
+ }
- path = ext4_find_extent(inode, start, &path, 0);
+ path = ext4_find_extent(inode, start, path, 0);
if (IS_ERR(path))
return PTR_ERR(path);
ex = path[path->p_depth].p_ext;
@@ -6018,12 +5899,9 @@ int ext4_ext_replay_set_iblocks(struct inode *inode)
if (IS_ERR(path))
return PTR_ERR(path);
ex = path[path->p_depth].p_ext;
- if (!ex) {
- ext4_free_ext_path(path);
+ if (!ex)
goto out;
- }
end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
- ext4_free_ext_path(path);
/* Count the number of data blocks */
cur = 0;
@@ -6049,32 +5927,28 @@ int ext4_ext_replay_set_iblocks(struct inode *inode)
ret = skip_hole(inode, &cur);
if (ret < 0)
goto out;
- path = ext4_find_extent(inode, cur, NULL, 0);
+ path = ext4_find_extent(inode, cur, path, 0);
if (IS_ERR(path))
goto out;
numblks += path->p_depth;
- ext4_free_ext_path(path);
while (cur < end) {
- path = ext4_find_extent(inode, cur, NULL, 0);
+ path = ext4_find_extent(inode, cur, path, 0);
if (IS_ERR(path))
break;
ex = path[path->p_depth].p_ext;
- if (!ex) {
- ext4_free_ext_path(path);
- return 0;
- }
+ if (!ex)
+ goto cleanup;
+
cur = max(cur + 1, le32_to_cpu(ex->ee_block) +
ext4_ext_get_actual_len(ex));
ret = skip_hole(inode, &cur);
- if (ret < 0) {
- ext4_free_ext_path(path);
+ if (ret < 0)
break;
- }
- path2 = ext4_find_extent(inode, cur, NULL, 0);
- if (IS_ERR(path2)) {
- ext4_free_ext_path(path);
+
+ path2 = ext4_find_extent(inode, cur, path2, 0);
+ if (IS_ERR(path2))
break;
- }
+
for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) {
cmp1 = cmp2 = 0;
if (i <= path->p_depth)
@@ -6086,13 +5960,14 @@ int ext4_ext_replay_set_iblocks(struct inode *inode)
if (cmp1 != cmp2 && cmp2 != 0)
numblks++;
}
- ext4_free_ext_path(path);
- ext4_free_ext_path(path2);
}
out:
inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9);
ext4_mark_inode_dirty(NULL, inode);
+cleanup:
+ ext4_free_ext_path(path);
+ ext4_free_ext_path(path2);
return 0;
}
@@ -6113,12 +5988,9 @@ int ext4_ext_clear_bb(struct inode *inode)
if (IS_ERR(path))
return PTR_ERR(path);
ex = path[path->p_depth].p_ext;
- if (!ex) {
- ext4_free_ext_path(path);
- return 0;
- }
+ if (!ex)
+ goto out;
end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
- ext4_free_ext_path(path);
cur = 0;
while (cur < end) {
@@ -6128,23 +6000,25 @@ int ext4_ext_clear_bb(struct inode *inode)
if (ret < 0)
break;
if (ret > 0) {
- path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
- if (!IS_ERR_OR_NULL(path)) {
+ path = ext4_find_extent(inode, map.m_lblk, path, 0);
+ if (!IS_ERR(path)) {
for (j = 0; j < path->p_depth; j++) {
-
ext4_mb_mark_bb(inode->i_sb,
- path[j].p_block, 1, 0);
+ path[j].p_block, 1, false);
ext4_fc_record_regions(inode->i_sb, inode->i_ino,
0, path[j].p_block, 1, 1);
}
- ext4_free_ext_path(path);
+ } else {
+ path = NULL;
}
- ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
ext4_fc_record_regions(inode->i_sb, inode->i_ino,
map.m_lblk, map.m_pblk, map.m_len, 1);
}
cur = cur + map.m_len;
}
+out:
+ ext4_free_ext_path(path);
return 0;
}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index d9d5cfb9c951..c786691dabd3 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -558,8 +558,8 @@ static int ext4_es_can_be_merged(struct extent_status *es1,
if (ext4_es_is_hole(es1))
return 1;
- /* we need to check delayed extent is without unwritten status */
- if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1))
+ /* we need to check delayed extent */
+ if (ext4_es_is_delayed(es1))
return 1;
return 0;
@@ -848,11 +848,12 @@ out:
*/
void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
- unsigned int status)
+ unsigned int status, int flags)
{
struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
int err1 = 0, err2 = 0, err3 = 0;
+ int resv_used = 0, pending = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct extent_status *es1 = NULL;
struct extent_status *es2 = NULL;
@@ -862,21 +863,14 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
- es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
- lblk, len, pblk, status, inode->i_ino);
+ es_debug("add [%u/%u) %llu %x %x to extent status tree of inode %lu\n",
+ lblk, len, pblk, status, flags, inode->i_ino);
if (!len)
return;
BUG_ON(end < lblk);
-
- if ((status & EXTENT_STATUS_DELAYED) &&
- (status & EXTENT_STATUS_WRITTEN)) {
- ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as "
- " delayed and written which can potentially "
- " cause data loss.", lblk, len);
- WARN_ON(1);
- }
+ WARN_ON_ONCE(status & EXTENT_STATUS_DELAYED);
newes.es_lblk = lblk;
newes.es_len = len;
@@ -894,11 +888,11 @@ retry:
es1 = __es_alloc_extent(true);
if ((err1 || err2) && !es2)
es2 = __es_alloc_extent(true);
- if ((err1 || err2 || err3) && revise_pending && !pr)
+ if ((err1 || err2 || err3 < 0) && revise_pending && !pr)
pr = __alloc_pending(true);
write_lock(&EXT4_I(inode)->i_es_lock);
- err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
+ err1 = __es_remove_extent(inode, lblk, end, &resv_used, es1);
if (err1 != 0)
goto error;
/* Free preallocated extent if it didn't get used. */
@@ -922,16 +916,38 @@ retry:
if (revise_pending) {
err3 = __revise_pending(inode, lblk, len, &pr);
- if (err3 != 0)
+ if (err3 < 0)
goto error;
if (pr) {
__free_pending(pr);
pr = NULL;
}
+ pending = err3;
}
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
- if (err1 || err2 || err3)
+ /*
+ * Reduce the reserved cluster count to reflect successful deferred
+ * allocation of delayed allocated clusters or direct allocation of
+ * clusters discovered to be delayed allocated. Once allocated, a
+ * cluster is not included in the reserved count.
+ *
+ * When direct allocating (from fallocate, filemap, DIO, or clusters
+ * allocated when delalloc has been disabled by ext4_nonda_switch())
+ * an extent either 1) contains delayed blocks but start with
+ * non-delayed allocated blocks (e.g. hole) or 2) contains non-delayed
+ * allocated blocks which belong to delayed allocated clusters when
+ * bigalloc feature is enabled, quota has already been claimed by
+ * ext4_mb_new_blocks(), so release the quota reservations made for
+ * any previously delayed allocated clusters instead of claim them
+ * again.
+ */
+ resv_used += pending;
+ if (resv_used)
+ ext4_da_update_reserve_space(inode, resv_used,
+ flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE);
+
+ if (err1 || err2 || err3 < 0)
goto retry;
ext4_es_print_tree(inode);
@@ -1051,7 +1067,7 @@ out:
}
struct rsvd_count {
- int ndelonly;
+ int ndelayed;
bool first_do_lblk_found;
ext4_lblk_t first_do_lblk;
ext4_lblk_t last_do_lblk;
@@ -1077,10 +1093,10 @@ static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct rb_node *node;
- rc->ndelonly = 0;
+ rc->ndelayed = 0;
/*
- * for bigalloc, note the first delonly block in the range has not
+ * for bigalloc, note the first delayed block in the range has not
* been found, record the extent containing the block to the left of
* the region to be removed, if any, and note that there's no partial
* cluster to track
@@ -1100,9 +1116,8 @@ static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
}
/*
- * count_rsvd - count the clusters containing delayed and not unwritten
- * (delonly) blocks in a range within an extent and add to
- * the running tally in rsvd_count
+ * count_rsvd - count the clusters containing delayed blocks in a range
+ * within an extent and add to the running tally in rsvd_count
*
* @inode - file containing extent
* @lblk - first block in range
@@ -1119,13 +1134,13 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_lblk_t i, end, nclu;
- if (!ext4_es_is_delonly(es))
+ if (!ext4_es_is_delayed(es))
return;
WARN_ON(len <= 0);
if (sbi->s_cluster_ratio == 1) {
- rc->ndelonly += (int) len;
+ rc->ndelayed += (int) len;
return;
}
@@ -1135,7 +1150,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
end = lblk + (ext4_lblk_t) len - 1;
end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end;
- /* record the first block of the first delonly extent seen */
+ /* record the first block of the first delayed extent seen */
if (!rc->first_do_lblk_found) {
rc->first_do_lblk = i;
rc->first_do_lblk_found = true;
@@ -1149,7 +1164,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
* doesn't start with it, count it and stop tracking
*/
if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) {
- rc->ndelonly++;
+ rc->ndelayed++;
rc->partial = false;
}
@@ -1159,7 +1174,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
*/
if (EXT4_LBLK_COFF(sbi, i) != 0) {
if (end >= EXT4_LBLK_CFILL(sbi, i)) {
- rc->ndelonly++;
+ rc->ndelayed++;
rc->partial = false;
i = EXT4_LBLK_CFILL(sbi, i) + 1;
}
@@ -1167,11 +1182,11 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
/*
* if the current cluster starts on a cluster boundary, count the
- * number of whole delonly clusters in the extent
+ * number of whole delayed clusters in the extent
*/
if ((i + sbi->s_cluster_ratio - 1) <= end) {
nclu = (end - i + 1) >> sbi->s_cluster_bits;
- rc->ndelonly += nclu;
+ rc->ndelayed += nclu;
i += nclu << sbi->s_cluster_bits;
}
@@ -1231,10 +1246,9 @@ static struct pending_reservation *__pr_tree_search(struct rb_root *root,
* @rc - pointer to reserved count data
*
* The number of reservations to be released is equal to the number of
- * clusters containing delayed and not unwritten (delonly) blocks within
- * the range, minus the number of clusters still containing delonly blocks
- * at the ends of the range, and minus the number of pending reservations
- * within the range.
+ * clusters containing delayed blocks within the range, minus the number of
+ * clusters still containing delayed blocks at the ends of the range, and
+ * minus the number of pending reservations within the range.
*/
static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
struct extent_status *right_es,
@@ -1245,33 +1259,33 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
struct rb_node *node;
ext4_lblk_t first_lclu, last_lclu;
- bool left_delonly, right_delonly, count_pending;
+ bool left_delayed, right_delayed, count_pending;
struct extent_status *es;
if (sbi->s_cluster_ratio > 1) {
/* count any remaining partial cluster */
if (rc->partial)
- rc->ndelonly++;
+ rc->ndelayed++;
- if (rc->ndelonly == 0)
+ if (rc->ndelayed == 0)
return 0;
first_lclu = EXT4_B2C(sbi, rc->first_do_lblk);
last_lclu = EXT4_B2C(sbi, rc->last_do_lblk);
/*
- * decrease the delonly count by the number of clusters at the
- * ends of the range that still contain delonly blocks -
+ * decrease the delayed count by the number of clusters at the
+ * ends of the range that still contain delayed blocks -
* these clusters still need to be reserved
*/
- left_delonly = right_delonly = false;
+ left_delayed = right_delayed = false;
es = rc->left_es;
while (es && ext4_es_end(es) >=
EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) {
- if (ext4_es_is_delonly(es)) {
- rc->ndelonly--;
- left_delonly = true;
+ if (ext4_es_is_delayed(es)) {
+ rc->ndelayed--;
+ left_delayed = true;
break;
}
node = rb_prev(&es->rb_node);
@@ -1279,7 +1293,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
break;
es = rb_entry(node, struct extent_status, rb_node);
}
- if (right_es && (!left_delonly || first_lclu != last_lclu)) {
+ if (right_es && (!left_delayed || first_lclu != last_lclu)) {
if (end < ext4_es_end(right_es)) {
es = right_es;
} else {
@@ -1289,9 +1303,9 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
}
while (es && es->es_lblk <=
EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) {
- if (ext4_es_is_delonly(es)) {
- rc->ndelonly--;
- right_delonly = true;
+ if (ext4_es_is_delayed(es)) {
+ rc->ndelayed--;
+ right_delayed = true;
break;
}
node = rb_next(&es->rb_node);
@@ -1305,21 +1319,21 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
/*
* Determine the block range that should be searched for
* pending reservations, if any. Clusters on the ends of the
- * original removed range containing delonly blocks are
+ * original removed range containing delayed blocks are
* excluded. They've already been accounted for and it's not
* possible to determine if an associated pending reservation
* should be released with the information available in the
* extents status tree.
*/
if (first_lclu == last_lclu) {
- if (left_delonly | right_delonly)
+ if (left_delayed | right_delayed)
count_pending = false;
else
count_pending = true;
} else {
- if (left_delonly)
+ if (left_delayed)
first_lclu++;
- if (right_delonly)
+ if (right_delayed)
last_lclu--;
if (first_lclu <= last_lclu)
count_pending = true;
@@ -1330,13 +1344,13 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
/*
* a pending reservation found between first_lclu and last_lclu
* represents an allocated cluster that contained at least one
- * delonly block, so the delonly total must be reduced by one
+ * delayed block, so the delayed total must be reduced by one
* for each pending reservation found and released
*/
if (count_pending) {
pr = __pr_tree_search(&tree->root, first_lclu);
while (pr && pr->lclu <= last_lclu) {
- rc->ndelonly--;
+ rc->ndelayed--;
node = rb_next(&pr->rb_node);
rb_erase(&pr->rb_node, &tree->root);
__free_pending(pr);
@@ -1347,7 +1361,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
}
}
}
- return rc->ndelonly;
+ return rc->ndelayed;
}
@@ -1634,7 +1648,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink,
unsigned long nr;
struct ext4_sb_info *sbi;
- sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
+ sbi = shrink->private_data;
nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
return nr;
@@ -1643,8 +1657,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink,
static unsigned long ext4_es_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct ext4_sb_info *sbi = container_of(shrink,
- struct ext4_sb_info, s_es_shrinker);
+ struct ext4_sb_info *sbi = shrink->private_data;
int nr_to_scan = sc->nr_to_scan;
int ret, nr_shrunk;
@@ -1728,13 +1741,17 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
if (err)
goto err3;
- sbi->s_es_shrinker.scan_objects = ext4_es_scan;
- sbi->s_es_shrinker.count_objects = ext4_es_count;
- sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
- err = register_shrinker(&sbi->s_es_shrinker, "ext4-es:%s",
- sbi->s_sb->s_id);
- if (err)
+ sbi->s_es_shrinker = shrinker_alloc(0, "ext4-es:%s", sbi->s_sb->s_id);
+ if (!sbi->s_es_shrinker) {
+ err = -ENOMEM;
goto err4;
+ }
+
+ sbi->s_es_shrinker->scan_objects = ext4_es_scan;
+ sbi->s_es_shrinker->count_objects = ext4_es_count;
+ sbi->s_es_shrinker->private_data = sbi;
+
+ shrinker_register(sbi->s_es_shrinker);
return 0;
err4:
@@ -1754,7 +1771,7 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
- unregister_shrinker(&sbi->s_es_shrinker);
+ shrinker_free(sbi->s_es_shrinker);
}
/*
@@ -1937,7 +1954,7 @@ static struct pending_reservation *__get_pending(struct inode *inode,
* @lblk - logical block in the cluster to be added
* @prealloc - preallocated pending entry
*
- * Returns 0 on successful insertion and -ENOMEM on failure. If the
+ * Returns 1 on successful insertion and -ENOMEM on failure. If the
* pending reservation is already in the set, returns successfully.
*/
static int __insert_pending(struct inode *inode, ext4_lblk_t lblk,
@@ -1981,6 +1998,7 @@ static int __insert_pending(struct inode *inode, ext4_lblk_t lblk,
rb_link_node(&pr->rb_node, parent, p);
rb_insert_color(&pr->rb_node, &tree->root);
+ ret = 1;
out:
return ret;
@@ -2051,34 +2069,49 @@ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
}
/*
- * ext4_es_insert_delayed_block - adds a delayed block to the extents status
- * tree, adding a pending reservation where
- * needed
+ * ext4_es_insert_delayed_extent - adds some delayed blocks to the extents
+ * status tree, adding a pending reservation
+ * where needed
*
* @inode - file containing the newly added block
- * @lblk - logical block to be added
- * @allocated - indicates whether a physical cluster has been allocated for
- * the logical cluster that contains the block
+ * @lblk - start logical block to be added
+ * @len - length of blocks to be added
+ * @lclu_allocated/end_allocated - indicates whether a physical cluster has
+ * been allocated for the logical cluster
+ * that contains the start/end block. Note that
+ * end_allocated should always be set to false
+ * if the start and the end block are in the
+ * same cluster
*/
-void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
- bool allocated)
+void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, bool lclu_allocated,
+ bool end_allocated)
{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct extent_status newes;
+ ext4_lblk_t end = lblk + len - 1;
int err1 = 0, err2 = 0, err3 = 0;
struct extent_status *es1 = NULL;
struct extent_status *es2 = NULL;
- struct pending_reservation *pr = NULL;
+ struct pending_reservation *pr1 = NULL;
+ struct pending_reservation *pr2 = NULL;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
- es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
- lblk, inode->i_ino);
+ es_debug("add [%u/%u) delayed to extent status tree of inode %lu\n",
+ lblk, len, inode->i_ino);
+ if (!len)
+ return;
+
+ WARN_ON_ONCE((EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) &&
+ end_allocated);
newes.es_lblk = lblk;
- newes.es_len = 1;
+ newes.es_len = len;
ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
- trace_ext4_es_insert_delayed_block(inode, &newes, allocated);
+ trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated,
+ end_allocated);
ext4_es_insert_extent_check(inode, &newes);
@@ -2087,11 +2120,15 @@ retry:
es1 = __es_alloc_extent(true);
if ((err1 || err2) && !es2)
es2 = __es_alloc_extent(true);
- if ((err1 || err2 || err3) && allocated && !pr)
- pr = __alloc_pending(true);
+ if (err1 || err2 || err3 < 0) {
+ if (lclu_allocated && !pr1)
+ pr1 = __alloc_pending(true);
+ if (end_allocated && !pr2)
+ pr2 = __alloc_pending(true);
+ }
write_lock(&EXT4_I(inode)->i_es_lock);
- err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1);
+ err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
if (err1 != 0)
goto error;
/* Free preallocated extent if it didn't get used. */
@@ -2111,18 +2148,27 @@ retry:
es2 = NULL;
}
- if (allocated) {
- err3 = __insert_pending(inode, lblk, &pr);
- if (err3 != 0)
+ if (lclu_allocated) {
+ err3 = __insert_pending(inode, lblk, &pr1);
+ if (err3 < 0)
goto error;
- if (pr) {
- __free_pending(pr);
- pr = NULL;
+ if (pr1) {
+ __free_pending(pr1);
+ pr1 = NULL;
+ }
+ }
+ if (end_allocated) {
+ err3 = __insert_pending(inode, end, &pr2);
+ if (err3 < 0)
+ goto error;
+ if (pr2) {
+ __free_pending(pr2);
+ pr2 = NULL;
}
}
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
- if (err1 || err2 || err3)
+ if (err1 || err2 || err3 < 0)
goto retry;
ext4_es_print_tree(inode);
@@ -2131,94 +2177,6 @@ error:
}
/*
- * __es_delayed_clu - count number of clusters containing blocks that
- * are delayed only
- *
- * @inode - file containing block range
- * @start - logical block defining start of range
- * @end - logical block defining end of range
- *
- * Returns the number of clusters containing only delayed (not delayed
- * and unwritten) blocks in the range specified by @start and @end. Any
- * cluster or part of a cluster within the range and containing a delayed
- * and not unwritten block within the range is counted as a whole cluster.
- */
-static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start,
- ext4_lblk_t end)
-{
- struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
- struct extent_status *es;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- struct rb_node *node;
- ext4_lblk_t first_lclu, last_lclu;
- unsigned long long last_counted_lclu;
- unsigned int n = 0;
-
- /* guaranteed to be unequal to any ext4_lblk_t value */
- last_counted_lclu = ~0ULL;
-
- es = __es_tree_search(&tree->root, start);
-
- while (es && (es->es_lblk <= end)) {
- if (ext4_es_is_delonly(es)) {
- if (es->es_lblk <= start)
- first_lclu = EXT4_B2C(sbi, start);
- else
- first_lclu = EXT4_B2C(sbi, es->es_lblk);
-
- if (ext4_es_end(es) >= end)
- last_lclu = EXT4_B2C(sbi, end);
- else
- last_lclu = EXT4_B2C(sbi, ext4_es_end(es));
-
- if (first_lclu == last_counted_lclu)
- n += last_lclu - first_lclu;
- else
- n += last_lclu - first_lclu + 1;
- last_counted_lclu = last_lclu;
- }
- node = rb_next(&es->rb_node);
- if (!node)
- break;
- es = rb_entry(node, struct extent_status, rb_node);
- }
-
- return n;
-}
-
-/*
- * ext4_es_delayed_clu - count number of clusters containing blocks that
- * are both delayed and unwritten
- *
- * @inode - file containing block range
- * @lblk - logical block defining start of range
- * @len - number of blocks in range
- *
- * Locking for external use of __es_delayed_clu().
- */
-unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
- ext4_lblk_t end;
- unsigned int n;
-
- if (len == 0)
- return 0;
-
- end = lblk + len - 1;
- WARN_ON(end < lblk);
-
- read_lock(&ei->i_es_lock);
-
- n = __es_delayed_clu(inode, lblk, end);
-
- read_unlock(&ei->i_es_lock);
-
- return n;
-}
-
-/*
* __revise_pending - makes, cancels, or leaves unchanged pending cluster
* reservations for a specified block range depending
* upon the presence or absence of delayed blocks
@@ -2232,7 +2190,9 @@ unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
*
* Used after a newly allocated extent is added to the extents status tree.
* Requires that the extents in the range have either written or unwritten
- * status. Must be called while holding i_es_lock.
+ * status. Must be called while holding i_es_lock. Returns number of new
+ * inserts pending cluster on insert pendings, returns 0 on remove pendings,
+ * return -ENOMEM on failure.
*/
static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len,
@@ -2242,6 +2202,7 @@ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t end = lblk + len - 1;
ext4_lblk_t first, last;
bool f_del = false, l_del = false;
+ int pendings = 0;
int ret = 0;
if (len == 0)
@@ -2263,49 +2224,53 @@ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) {
first = EXT4_LBLK_CMASK(sbi, lblk);
if (first != lblk)
- f_del = __es_scan_range(inode, &ext4_es_is_delonly,
+ f_del = __es_scan_range(inode, &ext4_es_is_delayed,
first, lblk - 1);
if (f_del) {
ret = __insert_pending(inode, first, prealloc);
if (ret < 0)
goto out;
+ pendings += ret;
} else {
last = EXT4_LBLK_CMASK(sbi, end) +
sbi->s_cluster_ratio - 1;
if (last != end)
l_del = __es_scan_range(inode,
- &ext4_es_is_delonly,
+ &ext4_es_is_delayed,
end + 1, last);
if (l_del) {
ret = __insert_pending(inode, last, prealloc);
if (ret < 0)
goto out;
+ pendings += ret;
} else
__remove_pending(inode, last);
}
} else {
first = EXT4_LBLK_CMASK(sbi, lblk);
if (first != lblk)
- f_del = __es_scan_range(inode, &ext4_es_is_delonly,
+ f_del = __es_scan_range(inode, &ext4_es_is_delayed,
first, lblk - 1);
if (f_del) {
ret = __insert_pending(inode, first, prealloc);
if (ret < 0)
goto out;
+ pendings += ret;
} else
__remove_pending(inode, first);
last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
if (last != end)
- l_del = __es_scan_range(inode, &ext4_es_is_delonly,
+ l_del = __es_scan_range(inode, &ext4_es_is_delayed,
end + 1, last);
if (l_del) {
ret = __insert_pending(inode, last, prealloc);
if (ret < 0)
goto out;
+ pendings += ret;
} else
__remove_pending(inode, last);
}
out:
- return ret;
+ return (ret < 0) ? ret : pendings;
}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index d9847a4a25db..4424232de298 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -42,6 +42,10 @@ enum {
#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS)
#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT)
+/*
+ * Besides EXTENT_STATUS_REFERENCED, all these extent type masks
+ * are exclusive, only one type can be set at a time.
+ */
#define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B)
#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B)
#define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B)
@@ -51,7 +55,9 @@ enum {
#define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \
EXTENT_STATUS_UNWRITTEN | \
EXTENT_STATUS_DELAYED | \
- EXTENT_STATUS_HOLE) << ES_SHIFT)
+ EXTENT_STATUS_HOLE))
+
+#define ES_TYPE_VALID(type) ((type) && !((type) & ((type) - 1)))
struct ext4_sb_info;
struct ext4_extent;
@@ -129,7 +135,7 @@ extern void ext4_es_init_tree(struct ext4_es_tree *tree);
extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
- unsigned int status);
+ unsigned int status, int flags);
extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
unsigned int status);
@@ -156,7 +162,7 @@ static inline unsigned int ext4_es_status(struct extent_status *es)
static inline unsigned int ext4_es_type(struct extent_status *es)
{
- return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT;
+ return (es->es_pblk >> ES_SHIFT) & ES_TYPE_MASK;
}
static inline int ext4_es_is_written(struct extent_status *es)
@@ -184,11 +190,6 @@ static inline int ext4_es_is_mapped(struct extent_status *es)
return (ext4_es_is_written(es) || ext4_es_is_unwritten(es));
}
-static inline int ext4_es_is_delonly(struct extent_status *es)
-{
- return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es));
-}
-
static inline void ext4_es_set_referenced(struct extent_status *es)
{
es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
@@ -224,17 +225,12 @@ static inline void ext4_es_store_pblock(struct extent_status *es,
es->es_pblk = block;
}
-static inline void ext4_es_store_status(struct extent_status *es,
- unsigned int status)
-{
- es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) |
- (es->es_pblk & ~ES_MASK);
-}
-
static inline void ext4_es_store_pblock_status(struct extent_status *es,
ext4_fsblk_t pb,
unsigned int status)
{
+ WARN_ON_ONCE(!ES_TYPE_VALID(status & ES_TYPE_MASK));
+
es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) |
(pb & ~ES_MASK);
}
@@ -249,10 +245,9 @@ extern void ext4_exit_pending(void);
extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
-extern void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
- bool allocated);
-extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len);
+extern void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, bool lclu_allocated,
+ bool end_allocated);
extern void ext4_clear_inode_es(struct inode *inode);
#endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index b527f4ab47e0..b33664f6ce2a 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -1795,7 +1795,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
if (ret == 0) {
/* Range is not mapped */
- path = ext4_find_extent(inode, cur, NULL, 0);
+ path = ext4_find_extent(inode, cur, path, 0);
if (IS_ERR(path))
goto out;
memset(&newex, 0, sizeof(newex));
@@ -1806,11 +1806,10 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
if (ext4_ext_is_unwritten(ex))
ext4_ext_mark_unwritten(&newex);
down_write(&EXT4_I(inode)->i_data_sem);
- ret = ext4_ext_insert_extent(
- NULL, inode, &path, &newex, 0);
+ path = ext4_ext_insert_extent(NULL, inode,
+ path, &newex, 0);
up_write((&EXT4_I(inode)->i_data_sem));
- ext4_free_ext_path(path);
- if (ret)
+ if (IS_ERR(path))
goto out;
goto next;
}
@@ -1835,7 +1834,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
* at the end of the FC replay using our array of
* modified inodes.
*/
- ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
goto next;
}
@@ -1859,6 +1858,7 @@ next:
ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
sb->s_blocksize_bits);
out:
+ ext4_free_ext_path(path);
iput(inode);
return 0;
}
@@ -1904,7 +1904,7 @@ ext4_fc_replay_del_range(struct super_block *sb,
if (ret > 0) {
remaining -= ret;
cur += ret;
- ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
} else {
remaining -= map.m_len;
cur += map.m_len;
@@ -1959,22 +1959,25 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
break;
if (ret > 0) {
- path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
+ path = ext4_find_extent(inode, map.m_lblk, path, 0);
if (!IS_ERR(path)) {
for (j = 0; j < path->p_depth; j++)
ext4_mb_mark_bb(inode->i_sb,
- path[j].p_block, 1, 1);
- ext4_free_ext_path(path);
+ path[j].p_block, 1, true);
+ } else {
+ path = NULL;
}
cur += ret;
ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
- map.m_len, 1);
+ map.m_len, true);
} else {
cur = cur + (map.m_len ? map.m_len : 1);
}
}
iput(inode);
}
+
+ ext4_free_ext_path(path);
}
/*
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index c71af675e310..6c692151b0d6 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -174,7 +174,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
(atomic_read(&inode->i_writecount) == 1) &&
!EXT4_I(inode)->i_reserved_data_blocks) {
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
up_write(&EXT4_I(inode)->i_data_sem);
}
if (is_dx(inode) && filp->private_data)
@@ -306,7 +306,7 @@ out:
}
static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
- ssize_t count)
+ ssize_t written, ssize_t count)
{
handle_t *handle;
@@ -315,7 +315,7 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
if (IS_ERR(handle))
return PTR_ERR(handle);
- if (ext4_update_inode_size(inode, offset + count)) {
+ if (ext4_update_inode_size(inode, offset + written)) {
int ret = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret)) {
ext4_journal_stop(handle);
@@ -323,11 +323,11 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
}
}
- if (inode->i_nlink)
+ if ((written == count) && inode->i_nlink)
ext4_orphan_del(handle, inode);
ext4_journal_stop(handle);
- return count;
+ return written;
}
/*
@@ -393,7 +393,7 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) &&
pos + size <= i_size_read(inode))
return size;
- return ext4_handle_inode_extension(inode, pos, size);
+ return ext4_handle_inode_extension(inode, pos, size, size);
}
static const struct iomap_dio_ops ext4_dio_write_ops = {
@@ -669,7 +669,7 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
if (extend) {
- ret = ext4_handle_inode_extension(inode, offset, ret);
+ ret = ext4_handle_inode_extension(inode, offset, ret, count);
ext4_inode_extension_cleanup(inode, ret < (ssize_t)count);
}
out:
@@ -844,8 +844,7 @@ static int ext4_sample_last_mounted(struct super_block *sb,
if (err)
goto out_journal;
lock_buffer(sbi->s_sbh);
- strncpy(sbi->s_es->s_last_mounted, cp,
- sizeof(sbi->s_es->s_last_mounted));
+ strtomem_pad(sbi->s_es->s_last_mounted, cp, 0);
ext4_superblock_csum_set(sb);
unlock_buffer(sbi->s_sbh);
ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
@@ -885,8 +884,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
return ret;
}
- filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC |
- FMODE_DIO_PARALLEL_WRITE;
+ filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
return dquot_file_open(inode, filp);
}
@@ -898,12 +896,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
- loff_t maxbytes;
-
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
- else
- maxbytes = inode->i_sb->s_maxbytes;
+ loff_t maxbytes = ext4_get_maxbytes(inode);
switch (whence) {
default:
@@ -938,7 +931,6 @@ const struct file_operations ext4_file_operations = {
.compat_ioctl = ext4_compat_ioctl,
#endif
.mmap = ext4_file_mmap,
- .mmap_supported_flags = MAP_SYNC,
.open = ext4_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
@@ -946,6 +938,8 @@ const struct file_operations ext4_file_operations = {
.splice_read = ext4_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ext4_fallocate,
+ .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
+ FOP_DIO_PARALLEL_WRITE,
};
const struct inode_operations ext4_file_inode_operations = {
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index 53a05b8292f0..91185c40f755 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -393,6 +393,14 @@ static unsigned int ext4_getfsmap_find_sb(struct super_block *sb,
/* Reserved GDT blocks */
if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) {
len = le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
+
+ /*
+ * mkfs.ext4 can set s_reserved_gdt_blocks as 0 in some cases,
+ * check for that.
+ */
+ if (!len)
+ return 0;
+
error = ext4_getfsmap_fill(meta_list, fsb, len,
EXT4_FMR_OWN_RESV_GDT);
if (error)
@@ -526,6 +534,7 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
ext4_group_t end_ag;
ext4_grpblk_t first_cluster;
ext4_grpblk_t last_cluster;
+ struct ext4_fsmap irec;
int error = 0;
bofs = le32_to_cpu(sbi->s_es->s_first_data_block);
@@ -609,10 +618,18 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
goto err;
}
- /* Report any gaps at the end of the bg */
+ /*
+ * The dummy record below will cause ext4_getfsmap_helper() to report
+ * any allocated blocks at the end of the range.
+ */
+ irec.fmr_device = 0;
+ irec.fmr_physical = end_fsb + 1;
+ irec.fmr_length = 0;
+ irec.fmr_owner = EXT4_FMR_OWN_FREE;
+ irec.fmr_flags = 0;
+
info->gfi_last = true;
- error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster + 1,
- 0, info);
+ error = ext4_getfsmap_helper(sb, info, &irec);
if (error)
goto err;
@@ -628,8 +645,9 @@ static bool ext4_getfsmap_is_valid_device(struct super_block *sb,
if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX ||
fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev))
return true;
- if (EXT4_SB(sb)->s_journal_bdev &&
- fm->fmr_device == new_encode_dev(EXT4_SB(sb)->s_journal_bdev->bd_dev))
+ if (EXT4_SB(sb)->s_journal_bdev_file &&
+ fm->fmr_device ==
+ new_encode_dev(file_bdev(EXT4_SB(sb)->s_journal_bdev_file)->bd_dev))
return true;
return false;
}
@@ -699,9 +717,9 @@ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head,
memset(handlers, 0, sizeof(handlers));
handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev);
handlers[0].gfd_fn = ext4_getfsmap_datadev;
- if (EXT4_SB(sb)->s_journal_bdev) {
+ if (EXT4_SB(sb)->s_journal_bdev_file) {
handlers[1].gfd_dev = new_encode_dev(
- EXT4_SB(sb)->s_journal_bdev->bd_dev);
+ file_bdev(EXT4_SB(sb)->s_journal_bdev_file)->bd_dev);
handlers[1].gfd_fn = ext4_getfsmap_logdev;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index d4d0ad689d3c..21d228073d79 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -87,10 +87,10 @@ static int ext4_validate_inode_bitmap(struct super_block *sb,
if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
return 0;
- grp = ext4_get_group_info(sb, block_group);
-
if (buffer_verified(bh))
return 0;
+
+ grp = ext4_get_group_info(sb, block_group);
if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
return -EFSCORRUPTED;
@@ -98,8 +98,7 @@ static int ext4_validate_inode_bitmap(struct super_block *sb,
if (buffer_verified(bh))
goto verified;
blk = ext4_inode_bitmap(sb, desc);
- if (!ext4_inode_bitmap_csum_verify(sb, desc, bh,
- EXT4_INODES_PER_GROUP(sb) / 8) ||
+ if (!ext4_inode_bitmap_csum_verify(sb, desc, bh) ||
ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) {
ext4_unlock_group(sb, block_group);
ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
@@ -328,8 +327,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
if (percpu_counter_initialized(&sbi->s_dirs_counter))
percpu_counter_dec(&sbi->s_dirs_counter);
}
- ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh,
- EXT4_INODES_PER_GROUP(sb) / 8);
+ ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh);
ext4_group_desc_csum_set(sb, block_group, gdp);
ext4_unlock_group(sb, block_group);
@@ -775,7 +773,7 @@ int ext4_mark_inode_used(struct super_block *sb, int ino)
}
gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
- if (!gdp || !group_desc_bh) {
+ if (!gdp) {
err = -EINVAL;
goto out;
}
@@ -854,8 +852,7 @@ int ext4_mark_inode_used(struct super_block *sb, int ino)
ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
if (ext4_has_group_desc_csum(sb)) {
- ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh,
- EXT4_INODES_PER_GROUP(sb) / 8);
+ ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh);
ext4_group_desc_csum_set(sb, group, gdp);
}
@@ -1065,7 +1062,6 @@ got_group:
EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
goto next_group;
-repeat_in_this_group:
ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
if (!ret2)
goto next_group;
@@ -1115,8 +1111,6 @@ repeat_in_this_group:
if (!ret2)
goto got; /* we grabbed the inode! */
- if (ino < EXT4_INODES_PER_GROUP(sb))
- goto repeat_in_this_group;
next_group:
if (++group == ngroups)
group = 0;
@@ -1229,8 +1223,7 @@ got:
}
}
if (ext4_has_group_desc_csum(sb)) {
- ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh,
- EXT4_INODES_PER_GROUP(sb) / 8);
+ ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh);
ext4_group_desc_csum_set(sb, group, gdp);
}
ext4_unlock_group(sb, group);
@@ -1255,8 +1248,8 @@ got:
inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
/* This is the optimal IO size (for stat), not the fs block size */
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
- ei->i_crtime = inode->i_mtime;
+ simple_inode_init_ts(inode);
+ ei->i_crtime = inode_get_mtime(inode);
memset(ei->i_data, 0, sizeof(ei->i_data));
ei->i_dir_start_lookup = 0;
@@ -1341,10 +1334,7 @@ got:
}
}
- if (ext4_handle_valid(handle)) {
- ei->i_sync_tid = handle->h_transaction->t_tid;
- ei->i_datasync_tid = handle->h_transaction->t_tid;
- }
+ ext4_update_inode_fsync_trans(handle, inode, 1);
err = ext4_mark_inode_dirty(handle, inode);
if (err) {
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index f2c495b745f1..d45124318200 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -539,7 +539,7 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
int indirect_blks;
int blocks_to_boundary = 0;
int depth;
- int count = 0;
+ u64 count = 0;
ext4_fsblk_t first_block = 0;
trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
@@ -588,7 +588,7 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
count++;
/* Fill in size of a hole we found */
map->m_pblk = 0;
- map->m_len = min_t(unsigned int, map->m_len, count);
+ map->m_len = umin(map->m_len, count);
goto cleanup;
}
@@ -652,13 +652,6 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
ext4_update_inode_fsync_trans(handle, inode, 1);
count = ar.len;
- /*
- * Update reserved blocks/metadata blocks after successful block
- * allocation which had been deferred till now.
- */
- if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
- ext4_da_update_reserve_space(inode, count, 1);
-
got_it:
map->m_flags |= EXT4_MAP_MAPPED;
map->m_pblk = le32_to_cpu(chain[depth-1].key);
@@ -714,7 +707,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
* i_rwsem. So we can safely drop the i_data_sem here.
*/
BUG_ON(EXT4_JOURNAL(inode) == NULL);
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
up_write(&EXT4_I(inode)->i_data_sem);
*dropped = 1;
return 0;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index cb65052ee3de..9fb5e0f172a7 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -298,7 +298,11 @@ static int ext4_create_inline_data(handle_t *handle,
if (error)
goto out;
- BUG_ON(!is.s.not_found);
+ if (!is.s.not_found) {
+ EXT4_ERROR_INODE(inode, "unexpected inline data xattr");
+ error = -EFSCORRUPTED;
+ goto out;
+ }
error = ext4_xattr_ibody_set(handle, inode, &i, &is);
if (error) {
@@ -349,7 +353,11 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
if (error)
goto out;
- BUG_ON(is.s.not_found);
+ if (is.s.not_found) {
+ EXT4_ERROR_INODE(inode, "missing inline data xattr");
+ error = -EFSCORRUPTED;
+ goto out;
+ }
len -= EXT4_MIN_INLINE_DATA_SIZE;
value = kzalloc(len, GFP_NOFS);
@@ -392,7 +400,7 @@ out:
}
static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
- unsigned int len)
+ loff_t len)
{
int ret, size, no_expand;
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -502,9 +510,8 @@ static int ext4_read_inline_folio(struct inode *inode, struct folio *folio)
BUG_ON(len > PAGE_SIZE);
kaddr = kmap_local_folio(folio, 0);
ret = ext4_read_inline_data(inode, kaddr, len, &iloc);
- flush_dcache_folio(folio);
+ kaddr = folio_zero_tail(folio, len, kaddr + len);
kunmap_local(kaddr);
- folio_zero_segment(folio, len, folio_size(folio));
folio_mark_uptodate(folio);
brelse(iloc.bh);
@@ -602,10 +609,12 @@ retry:
goto out;
if (ext4_should_dioread_nolock(inode)) {
- ret = __block_write_begin(&folio->page, from, to,
- ext4_get_block_unwritten);
+ ret = ext4_block_write_begin(handle, folio, from, to,
+ ext4_get_block_unwritten);
} else
- ret = __block_write_begin(&folio->page, from, to, ext4_get_block);
+ ret = ext4_block_write_begin(handle, folio, from, to,
+ ext4_get_block);
+ clear_buffer_new(folio_buffers(folio));
if (!ret && ext4_should_journal_data(inode)) {
ret = ext4_walk_page_buffers(handle, inode,
@@ -661,7 +670,7 @@ out_nofolio:
int ext4_try_to_write_inline_data(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
- struct page **pagep)
+ struct folio **foliop)
{
int ret;
handle_t *handle;
@@ -709,7 +718,7 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
goto out;
}
- *pagep = &folio->page;
+ *foliop = folio;
down_read(&EXT4_I(inode)->xattr_sem);
if (!ext4_has_inline_data(inode)) {
ret = 0;
@@ -857,8 +866,8 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
goto out;
}
- ret = __block_write_begin(&folio->page, 0, inline_size,
- ext4_da_get_block_prep);
+ ret = ext4_block_write_begin(NULL, folio, 0, inline_size,
+ ext4_da_get_block_prep);
if (ret) {
up_read(&EXT4_I(inode)->xattr_sem);
folio_unlock(folio);
@@ -867,6 +876,7 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
return ret;
}
+ clear_buffer_new(folio_buffers(folio));
folio_mark_dirty(folio);
folio_mark_uptodate(folio);
ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
@@ -892,7 +902,7 @@ out:
int ext4_da_write_inline_data_begin(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
- struct page **pagep,
+ struct folio **foliop,
void **fsdata)
{
int ret;
@@ -955,7 +965,7 @@ retry_journal:
goto out_release_page;
up_read(&EXT4_I(inode)->xattr_sem);
- *pagep = &folio->page;
+ *foliop = folio;
brelse(iloc.bh);
return 1;
out_release_page:
@@ -1037,7 +1047,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
* happen is that the times are slightly out of date
* and/or different from the directory change time.
*/
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
ext4_update_dx_flag(dir);
inode_inc_iversion(dir);
return 1;
@@ -1461,6 +1471,7 @@ int ext4_read_inline_dir(struct file *file,
struct ext4_iloc iloc;
void *dir_buf = NULL;
int dotdot_offset, dotdot_size, extra_offset, extra_size;
+ struct dir_private_info *info = file->private_data;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
@@ -1504,12 +1515,12 @@ int ext4_read_inline_dir(struct file *file,
extra_size = extra_offset + inline_size;
/*
- * If the version has changed since the last call to
+ * If the cookie has changed since the last call to
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the inline
* dir to make sure.
*/
- if (!inode_eq_iversion(inode, file->f_version)) {
+ if (!inode_eq_iversion(inode, info->cookie)) {
for (i = 0; i < extra_size && i < offset;) {
/*
* "." is with offset 0 and
@@ -1541,7 +1552,7 @@ int ext4_read_inline_dir(struct file *file,
}
offset = i;
ctx->pos = offset;
- file->f_version = inode_query_iversion(inode);
+ info->cookie = inode_query_iversion(inode);
}
while (ctx->pos < extra_size) {
@@ -1966,7 +1977,12 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0)
goto out_error;
- BUG_ON(is.s.not_found);
+ if (is.s.not_found) {
+ EXT4_ERROR_INODE(inode,
+ "missing inline data xattr");
+ err = -EFSCORRUPTED;
+ goto out_error;
+ }
value_len = le32_to_cpu(is.s.here->e_value_size);
value = kmalloc(value_len, GFP_NOFS);
@@ -2010,7 +2026,7 @@ out:
ext4_orphan_del(handle, inode);
if (err == 0) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
err = ext4_mark_inode_dirty(handle, inode);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
diff --git a/fs/ext4/inode-test.c b/fs/ext4/inode-test.c
index f0c0fd507fbc..749af7ad4e09 100644
--- a/fs/ext4/inode-test.c
+++ b/fs/ext4/inode-test.c
@@ -279,4 +279,5 @@ static struct kunit_suite ext4_inode_test_suite = {
kunit_test_suites(&ext4_inode_test_suite);
+MODULE_DESCRIPTION("KUnit test of ext4 inode timestamp decoding");
MODULE_LICENSE("GPL v2");
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 18ec9106c5b0..7923602271ad 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -31,6 +31,7 @@
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
+#include <linux/rmap.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
@@ -49,6 +50,11 @@
#include <trace/events/ext4.h>
+static void ext4_journalled_zero_new_buffers(handle_t *handle,
+ struct inode *inode,
+ struct folio *folio,
+ unsigned from, unsigned to);
+
static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
struct ext4_inode_info *ei)
{
@@ -145,7 +151,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
*/
int ext4_inode_is_fast_symlink(struct inode *inode)
{
- if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
+ if (!ext4_has_feature_ea_inode(inode->i_sb)) {
int ea_blocks = EXT4_I(inode)->i_file_acl ?
EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
@@ -371,17 +377,18 @@ void ext4_da_update_reserve_space(struct inode *inode,
*/
if ((ei->i_reserved_data_blocks == 0) &&
!inode_is_open_for_write(inode))
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
}
static int __check_block_validity(struct inode *inode, const char *func,
unsigned int line,
struct ext4_map_blocks *map)
{
- if (ext4_has_feature_journal(inode->i_sb) &&
- (inode->i_ino ==
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+
+ if (journal && inode == journal->j_inode)
return 0;
+
if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
ext4_error_inode(inode, func, line, map->m_pblk,
"lblock %lu mapped to illegal pblock %llu "
@@ -478,7 +485,89 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status);
+ map->m_pblk, status, 0);
+ return retval;
+}
+
+static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags)
+{
+ struct extent_status es;
+ unsigned int status;
+ int err, retval = 0;
+
+ /*
+ * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE
+ * indicates that the blocks and quotas has already been
+ * checked when the data was copied into the page cache.
+ */
+ if (map->m_flags & EXT4_MAP_DELAYED)
+ flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
+
+ /*
+ * Here we clear m_flags because after allocating an new extent,
+ * it will be set again.
+ */
+ map->m_flags &= ~EXT4_MAP_FLAGS;
+
+ /*
+ * We need to check for EXT4 here because migrate could have
+ * changed the inode type in between.
+ */
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ retval = ext4_ext_map_blocks(handle, inode, map, flags);
+ } else {
+ retval = ext4_ind_map_blocks(handle, inode, map, flags);
+
+ /*
+ * We allocated new blocks which will result in i_data's
+ * format changing. Force the migrate to fail by clearing
+ * migrate flags.
+ */
+ if (retval > 0 && map->m_flags & EXT4_MAP_NEW)
+ ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
+ }
+ if (retval <= 0)
+ return retval;
+
+ if (unlikely(retval != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode %lu: "
+ "retval %d != map->m_len %d",
+ inode->i_ino, retval, map->m_len);
+ WARN_ON(1);
+ }
+
+ /*
+ * We have to zeroout blocks before inserting them into extent
+ * status tree. Otherwise someone could look them up there and
+ * use them before they are really zeroed. We also have to
+ * unmap metadata before zeroing as otherwise writeback can
+ * overwrite zeros with stale data from block device.
+ */
+ if (flags & EXT4_GET_BLOCKS_ZERO &&
+ map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) {
+ err = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk,
+ map->m_len);
+ if (err)
+ return err;
+ }
+
+ /*
+ * If the extent has been zeroed out, we don't need to update
+ * extent status tree.
+ */
+ if (flags & EXT4_GET_BLOCKS_PRE_IO &&
+ ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+ if (ext4_es_is_written(&es))
+ return retval;
+ }
+
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status, flags);
+
return retval;
}
@@ -494,9 +583,10 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
* Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
* based files
*
- * On success, it returns the number of blocks being mapped or allocated. if
- * create==0 and the blocks are pre-allocated and unwritten, the resulting @map
- * is marked as unwritten. If the create == 1, it will mark @map as mapped.
+ * On success, it returns the number of blocks being mapped or allocated.
+ * If flags doesn't contain EXT4_GET_BLOCKS_CREATE the blocks are
+ * pre-allocated and unwritten, the resulting @map is marked as unwritten.
+ * If the flags contain EXT4_GET_BLOCKS_CREATE, it will mark @map as mapped.
*
* It returns 0 if plain look up failed (blocks have not been allocated), in
* that case, @map is returned as unmapped but we still do fill map->m_len to
@@ -544,6 +634,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
map->m_len = retval;
} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
map->m_pblk = 0;
+ map->m_flags |= ext4_es_is_delayed(&es) ?
+ EXT4_MAP_DELAYED : 0;
retval = es.es_len - (map->m_lblk - es.es_lblk);
if (retval > map->m_len)
retval = map->m_len;
@@ -573,32 +665,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
* file system block.
*/
down_read(&EXT4_I(inode)->i_data_sem);
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- retval = ext4_ext_map_blocks(handle, inode, map, 0);
- } else {
- retval = ext4_ind_map_blocks(handle, inode, map, 0);
- }
- if (retval > 0) {
- unsigned int status;
-
- if (unlikely(retval != map->m_len)) {
- ext4_warning(inode->i_sb,
- "ES len assertion failed for inode "
- "%lu: retval %d != map->m_len %d",
- inode->i_ino, retval, map->m_len);
- WARN_ON(1);
- }
-
- status = map->m_flags & EXT4_MAP_UNWRITTEN ?
- EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
- if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
- !(status & EXTENT_STATUS_WRITTEN) &&
- ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
- map->m_lblk + map->m_len - 1))
- status |= EXTENT_STATUS_DELAYED;
- ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status);
- }
+ retval = ext4_map_query_blocks(handle, inode, map);
up_read((&EXT4_I(inode)->i_data_sem));
found:
@@ -616,8 +683,7 @@ found:
* Returns if the blocks have already allocated
*
* Note that if blocks have been preallocated
- * ext4_ext_get_block() returns the create = 0
- * with buffer head unmapped.
+ * ext4_ext_map_blocks() returns with buffer head unmapped
*/
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
/*
@@ -629,88 +695,13 @@ found:
return retval;
/*
- * Here we clear m_flags because after allocating an new extent,
- * it will be set again.
- */
- map->m_flags &= ~EXT4_MAP_FLAGS;
-
- /*
* New blocks allocate and/or writing to unwritten extent
* will possibly result in updating i_data, so we take
* the write lock of i_data_sem, and call get_block()
* with create == 1 flag.
*/
down_write(&EXT4_I(inode)->i_data_sem);
-
- /*
- * We need to check for EXT4 here because migrate
- * could have changed the inode type in between
- */
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- retval = ext4_ext_map_blocks(handle, inode, map, flags);
- } else {
- retval = ext4_ind_map_blocks(handle, inode, map, flags);
-
- if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
- /*
- * We allocated new blocks which will result in
- * i_data's format changing. Force the migrate
- * to fail by clearing migrate flags
- */
- ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
- }
- }
-
- if (retval > 0) {
- unsigned int status;
-
- if (unlikely(retval != map->m_len)) {
- ext4_warning(inode->i_sb,
- "ES len assertion failed for inode "
- "%lu: retval %d != map->m_len %d",
- inode->i_ino, retval, map->m_len);
- WARN_ON(1);
- }
-
- /*
- * We have to zeroout blocks before inserting them into extent
- * status tree. Otherwise someone could look them up there and
- * use them before they are really zeroed. We also have to
- * unmap metadata before zeroing as otherwise writeback can
- * overwrite zeros with stale data from block device.
- */
- if (flags & EXT4_GET_BLOCKS_ZERO &&
- map->m_flags & EXT4_MAP_MAPPED &&
- map->m_flags & EXT4_MAP_NEW) {
- ret = ext4_issue_zeroout(inode, map->m_lblk,
- map->m_pblk, map->m_len);
- if (ret) {
- retval = ret;
- goto out_sem;
- }
- }
-
- /*
- * If the extent has been zeroed out, we don't need to update
- * extent status tree.
- */
- if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
- ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
- if (ext4_es_is_written(&es))
- goto out_sem;
- }
- status = map->m_flags & EXT4_MAP_UNWRITTEN ?
- EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
- if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
- !(status & EXTENT_STATUS_WRITTEN) &&
- ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
- map->m_lblk + map->m_len - 1))
- status |= EXTENT_STATUS_DELAYED;
- ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status);
- }
-
-out_sem:
+ retval = ext4_map_create_blocks(handle, inode, map, flags);
up_write((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
ret = check_block_validity(inode, map);
@@ -1009,39 +1000,28 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode,
*/
static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh)
{
- folio_mark_dirty(bh->b_folio);
+ struct folio *folio = bh->b_folio;
+ struct inode *inode = folio->mapping->host;
+
+ /* only regular files have a_ops */
+ if (S_ISREG(inode->i_mode))
+ folio_mark_dirty(folio);
return ext4_handle_dirty_metadata(handle, NULL, bh);
}
int do_journal_get_write_access(handle_t *handle, struct inode *inode,
struct buffer_head *bh)
{
- int dirty = buffer_dirty(bh);
- int ret;
-
if (!buffer_mapped(bh) || buffer_freed(bh))
return 0;
- /*
- * __block_write_begin() could have dirtied some buffers. Clean
- * the dirty bit as jbd2_journal_get_write_access() could complain
- * otherwise about fs integrity issues. Setting of the dirty bit
- * by __block_write_begin() isn't a real problem here as we clear
- * the bit before releasing a page lock and thus writeback cannot
- * ever write the buffer.
- */
- if (dirty)
- clear_buffer_dirty(bh);
BUFFER_TRACE(bh, "get write access");
- ret = ext4_journal_get_write_access(handle, inode->i_sb, bh,
+ return ext4_journal_get_write_access(handle, inode->i_sb, bh,
EXT4_JTR_NONE);
- if (!ret && dirty)
- ret = ext4_dirty_journalled_data(handle, bh);
- return ret;
}
-#ifdef CONFIG_FS_ENCRYPTION
-static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
- get_block_t *get_block)
+int ext4_block_write_begin(handle_t *handle, struct folio *folio,
+ loff_t pos, unsigned len,
+ get_block_t *get_block)
{
unsigned from = pos & (PAGE_SIZE - 1);
unsigned to = from + len;
@@ -1054,6 +1034,7 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
struct buffer_head *bh, *head, *wait[2];
int nr_wait = 0;
int i;
+ bool should_journal_data = ext4_should_journal_data(inode);
BUG_ON(!folio_test_locked(folio));
BUG_ON(from > PAGE_SIZE);
@@ -1061,10 +1042,8 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
BUG_ON(from > to);
head = folio_buffers(folio);
- if (!head) {
- create_empty_buffers(&folio->page, blocksize, 0);
- head = folio_buffers(folio);
- }
+ if (!head)
+ head = create_empty_buffers(folio, blocksize, 0);
bbits = ilog2(blocksize);
block = (sector_t)folio->index << (PAGE_SHIFT - bbits);
@@ -1077,7 +1056,7 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
}
continue;
}
- if (buffer_new(bh))
+ if (WARN_ON_ONCE(buffer_new(bh)))
clear_buffer_new(bh);
if (!buffer_mapped(bh)) {
WARN_ON(bh->b_size != blocksize);
@@ -1085,10 +1064,22 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
if (err)
break;
if (buffer_new(bh)) {
+ /*
+ * We may be zeroing partial buffers or all new
+ * buffers in case of failure. Prepare JBD2 for
+ * that.
+ */
+ if (should_journal_data)
+ do_journal_get_write_access(handle,
+ inode, bh);
if (folio_test_uptodate(folio)) {
- clear_buffer_new(bh);
+ /*
+ * Unlike __block_write_begin() we leave
+ * dirtying of new uptodate buffers to
+ * ->write_end() time or
+ * folio_zero_new_buffers().
+ */
set_buffer_uptodate(bh);
- mark_buffer_dirty(bh);
continue;
}
if (block_end > to || block_start < from)
@@ -1118,7 +1109,11 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
err = -EIO;
}
if (unlikely(err)) {
- folio_zero_new_buffers(folio, from, to);
+ if (should_journal_data)
+ ext4_journalled_zero_new_buffers(handle, inode, folio,
+ from, to);
+ else
+ folio_zero_new_buffers(folio, from, to);
} else if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
for (i = 0; i < nr_wait; i++) {
int err2;
@@ -1134,7 +1129,6 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
return err;
}
-#endif
/*
* To preserve ordering, it is essential that the hole instantiation and
@@ -1145,7 +1139,7 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
*/
static int ext4_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
struct inode *inode = mapping->host;
int ret, needed_blocks;
@@ -1170,7 +1164,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
- pagep);
+ foliop);
if (ret < 0)
return ret;
if (ret == 1)
@@ -1194,7 +1188,7 @@ retry_grab:
* starting the handle.
*/
if (!folio_buffers(folio))
- create_empty_buffers(&folio->page, inode->i_sb->s_blocksize, 0);
+ create_empty_buffers(folio, inode->i_sb->s_blocksize, 0);
folio_unlock(folio);
@@ -1216,19 +1210,12 @@ retry_journal:
/* In case writeback began while the folio was unlocked */
folio_wait_stable(folio);
-#ifdef CONFIG_FS_ENCRYPTION
if (ext4_should_dioread_nolock(inode))
- ret = ext4_block_write_begin(folio, pos, len,
+ ret = ext4_block_write_begin(handle, folio, pos, len,
ext4_get_block_unwritten);
else
- ret = ext4_block_write_begin(folio, pos, len, ext4_get_block);
-#else
- if (ext4_should_dioread_nolock(inode))
- ret = __block_write_begin(&folio->page, pos, len,
- ext4_get_block_unwritten);
- else
- ret = __block_write_begin(&folio->page, pos, len, ext4_get_block);
-#endif
+ ret = ext4_block_write_begin(handle, folio, pos, len,
+ ext4_get_block);
if (!ret && ext4_should_journal_data(inode)) {
ret = ext4_walk_page_buffers(handle, inode,
folio_buffers(folio), from, to,
@@ -1241,7 +1228,7 @@ retry_journal:
folio_unlock(folio);
/*
- * __block_write_begin may have instantiated a few blocks
+ * ext4_block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
* i_size_read because we hold i_rwsem.
*
@@ -1270,7 +1257,7 @@ retry_journal:
folio_put(folio);
return ret;
}
- *pagep = &folio->page;
+ *foliop = folio;
return ret;
}
@@ -1285,6 +1272,7 @@ static int write_end_fn(handle_t *handle, struct inode *inode,
ret = ext4_dirty_journalled_data(handle, bh);
clear_buffer_meta(bh);
clear_buffer_prio(bh);
+ clear_buffer_new(bh);
return ret;
}
@@ -1292,15 +1280,14 @@ static int write_end_fn(handle_t *handle, struct inode *inode,
* We need to pick up the new inode size which generic_commit_write gave us
* `file' can be NULL - eg, when called from page_symlink().
*
- * ext4 never places buffers on inode->i_mapping->private_list. metadata
+ * ext4 never places buffers on inode->i_mapping->i_private_list. metadata
* buffers are managed internally.
*/
static int ext4_write_end(struct file *file,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
- struct folio *folio = page_folio(page);
handle_t *handle = ext4_journal_current_handle();
struct inode *inode = mapping->host;
loff_t old_size = inode->i_size;
@@ -1315,7 +1302,7 @@ static int ext4_write_end(struct file *file,
return ext4_write_inline_data_end(inode, pos, len, copied,
folio);
- copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+ copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata);
/*
* it's important to update i_size while still holding folio lock:
* page writeout could otherwise come in and zero beyond i_size.
@@ -1328,8 +1315,10 @@ static int ext4_write_end(struct file *file,
folio_unlock(folio);
folio_put(folio);
- if (old_size < pos && !verity)
+ if (old_size < pos && !verity) {
pagecache_isize_extended(inode, old_size, pos);
+ ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size);
+ }
/*
* Don't mark the inode dirty under folio lock. First, it unnecessarily
* makes the holding time of folio lock longer. Second, it forces lock
@@ -1389,9 +1378,9 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle,
size = min(to, block_end) - start;
folio_zero_range(folio, start, size);
- write_end_fn(handle, inode, bh);
}
clear_buffer_new(bh);
+ write_end_fn(handle, inode, bh);
}
}
block_start = block_end;
@@ -1402,9 +1391,8 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle,
static int ext4_journalled_write_end(struct file *file,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
- struct folio *folio = page_folio(page);
handle_t *handle = ext4_journal_current_handle();
struct inode *inode = mapping->host;
loff_t old_size = inode->i_size;
@@ -1445,8 +1433,10 @@ static int ext4_journalled_write_end(struct file *file,
folio_unlock(folio);
folio_put(folio);
- if (old_size < pos && !verity)
+ if (old_size < pos && !verity) {
pagecache_isize_extended(inode, old_size, pos);
+ ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size);
+ }
if (size_changed) {
ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -1479,9 +1469,9 @@ static int ext4_journalled_write_end(struct file *file,
}
/*
- * Reserve space for a single cluster
+ * Reserve space for 'nr_resv' clusters
*/
-static int ext4_da_reserve_space(struct inode *inode)
+static int ext4_da_reserve_space(struct inode *inode, int nr_resv)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -1492,18 +1482,18 @@ static int ext4_da_reserve_space(struct inode *inode)
* us from metadata over-estimation, though we may go over by
* a small amount in the end. Here we just reserve for data.
*/
- ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
+ ret = dquot_reserve_block(inode, EXT4_C2B(sbi, nr_resv));
if (ret)
return ret;
spin_lock(&ei->i_block_reservation_lock);
- if (ext4_claim_free_clusters(sbi, 1, 0)) {
+ if (ext4_claim_free_clusters(sbi, nr_resv, 0)) {
spin_unlock(&ei->i_block_reservation_lock);
- dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
+ dquot_release_reservation_block(inode, EXT4_C2B(sbi, nr_resv));
return -ENOSPC;
}
- ei->i_reserved_data_blocks++;
- trace_ext4_da_reserve_space(inode);
+ ei->i_reserved_data_blocks += nr_resv;
+ trace_ext4_da_reserve_space(inode, nr_resv);
spin_unlock(&ei->i_block_reservation_lock);
return 0; /* success */
@@ -1650,24 +1640,58 @@ static void ext4_print_free_blocks(struct inode *inode)
}
/*
- * ext4_insert_delayed_block - adds a delayed block to the extents status
- * tree, incrementing the reserved cluster/block
- * count or making a pending reservation
- * where needed
+ * Check whether the cluster containing lblk has been allocated or has
+ * delalloc reservation.
+ *
+ * Returns 0 if the cluster doesn't have either, 1 if it has delalloc
+ * reservation, 2 if it's already been allocated, negative error code on
+ * failure.
+ */
+static int ext4_clu_alloc_state(struct inode *inode, ext4_lblk_t lblk)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int ret;
+
+ /* Has delalloc reservation? */
+ if (ext4_es_scan_clu(inode, &ext4_es_is_delayed, lblk))
+ return 1;
+
+ /* Already been allocated? */
+ if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk))
+ return 2;
+ ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk));
+ if (ret < 0)
+ return ret;
+ if (ret > 0)
+ return 2;
+
+ return 0;
+}
+
+/*
+ * ext4_insert_delayed_blocks - adds a multiple delayed blocks to the extents
+ * status tree, incrementing the reserved
+ * cluster/block count or making pending
+ * reservations where needed
*
* @inode - file containing the newly added block
- * @lblk - logical block to be added
+ * @lblk - start logical block to be added
+ * @len - length of blocks to be added
*
* Returns 0 on success, negative error code on failure.
*/
-static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
+static int ext4_insert_delayed_blocks(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int ret;
- bool allocated = false;
+ bool lclu_allocated = false;
+ bool end_allocated = false;
+ ext4_lblk_t resv_clu;
+ ext4_lblk_t end = lblk + len - 1;
/*
- * If the cluster containing lblk is shared with a delayed,
+ * If the cluster containing lblk or end is shared with a delayed,
* written, or unwritten extent in a bigalloc file system, it's
* already been accounted for and does not need to be reserved.
* A pending reservation must be made for the cluster if it's
@@ -1678,62 +1702,70 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
* extents status tree doesn't get a match.
*/
if (sbi->s_cluster_ratio == 1) {
- ret = ext4_da_reserve_space(inode);
+ ret = ext4_da_reserve_space(inode, len);
if (ret != 0) /* ENOSPC */
return ret;
} else { /* bigalloc */
- if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
- if (!ext4_es_scan_clu(inode,
- &ext4_es_is_mapped, lblk)) {
- ret = ext4_clu_mapped(inode,
- EXT4_B2C(sbi, lblk));
- if (ret < 0)
- return ret;
- if (ret == 0) {
- ret = ext4_da_reserve_space(inode);
- if (ret != 0) /* ENOSPC */
- return ret;
- } else {
- allocated = true;
- }
- } else {
- allocated = true;
+ resv_clu = EXT4_B2C(sbi, end) - EXT4_B2C(sbi, lblk) + 1;
+
+ ret = ext4_clu_alloc_state(inode, lblk);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ resv_clu--;
+ lclu_allocated = (ret == 2);
+ }
+
+ if (EXT4_B2C(sbi, lblk) != EXT4_B2C(sbi, end)) {
+ ret = ext4_clu_alloc_state(inode, end);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ resv_clu--;
+ end_allocated = (ret == 2);
}
}
+
+ if (resv_clu) {
+ ret = ext4_da_reserve_space(inode, resv_clu);
+ if (ret != 0) /* ENOSPC */
+ return ret;
+ }
}
- ext4_es_insert_delayed_block(inode, lblk, allocated);
+ ext4_es_insert_delayed_extent(inode, lblk, len, lclu_allocated,
+ end_allocated);
return 0;
}
/*
- * This function is grabs code from the very beginning of
- * ext4_map_blocks, but assumes that the caller is from delayed write
- * time. This function looks up the requested blocks and sets the
- * buffer delay bit under the protection of i_data_sem.
+ * Looks up the requested blocks and sets the delalloc extent map.
+ * First try to look up for the extent entry that contains the requested
+ * blocks in the extent status tree without i_data_sem, then try to look
+ * up for the ondisk extent mapping with i_data_sem in read mode,
+ * finally hold i_data_sem in write mode, looks up again and add a
+ * delalloc extent entry if it still couldn't find any extent. Pass out
+ * the mapped extent through @map and return 0 on success.
*/
-static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
- struct ext4_map_blocks *map,
- struct buffer_head *bh)
+static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
{
struct extent_status es;
int retval;
- sector_t invalid_block = ~((sector_t) 0xffff);
#ifdef ES_AGGRESSIVE_TEST
struct ext4_map_blocks orig_map;
memcpy(&orig_map, map, sizeof(*map));
#endif
- if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
- invalid_block = ~0;
-
map->m_flags = 0;
ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len,
(unsigned long) map->m_lblk);
/* Lookup extent status tree firstly */
- if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
+ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+ map->m_len = min_t(unsigned int, map->m_len,
+ es.es_len - (map->m_lblk - es.es_lblk));
+
if (ext4_es_is_hole(&es))
goto add_delayed;
@@ -1742,18 +1774,12 @@ found:
* Delayed extent could be allocated by fallocate.
* So we need to check it.
*/
- if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
- map_bh(bh, inode->i_sb, invalid_block);
- set_buffer_new(bh);
- set_buffer_delay(bh);
+ if (ext4_es_is_delayed(&es)) {
+ map->m_flags |= EXT4_MAP_DELAYED;
return 0;
}
- map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
- retval = es.es_len - (iblock - es.es_lblk);
- if (retval > map->m_len)
- retval = map->m_len;
- map->m_len = retval;
+ map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk;
if (ext4_es_is_written(&es))
map->m_flags |= EXT4_MAP_MAPPED;
else if (ext4_es_is_unwritten(&es))
@@ -1764,7 +1790,7 @@ found:
#ifdef ES_AGGRESSIVE_TEST
ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
#endif
- return retval;
+ return 0;
}
/*
@@ -1778,7 +1804,7 @@ found:
retval = ext4_map_query_blocks(NULL, inode, map);
up_read(&EXT4_I(inode)->i_data_sem);
if (retval)
- return retval;
+ return retval < 0 ? retval : 0;
add_delayed:
down_write(&EXT4_I(inode)->i_data_sem);
@@ -1789,7 +1815,10 @@ add_delayed:
* is held in write mode, before inserting a new da entry in
* the extent status tree.
*/
- if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
+ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+ map->m_len = min_t(unsigned int, map->m_len,
+ es.es_len - (map->m_lblk - es.es_lblk));
+
if (!ext4_es_is_hole(&es)) {
up_write(&EXT4_I(inode)->i_data_sem);
goto found;
@@ -1798,18 +1827,14 @@ add_delayed:
retval = ext4_map_query_blocks(NULL, inode, map);
if (retval) {
up_write(&EXT4_I(inode)->i_data_sem);
- return retval;
+ return retval < 0 ? retval : 0;
}
}
- retval = ext4_insert_delayed_block(inode, map->m_lblk);
+ map->m_flags |= EXT4_MAP_DELAYED;
+ retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len);
up_write(&EXT4_I(inode)->i_data_sem);
- if (retval)
- return retval;
- map_bh(bh, inode->i_sb, invalid_block);
- set_buffer_new(bh);
- set_buffer_delay(bh);
return retval;
}
@@ -1829,11 +1854,15 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create)
{
struct ext4_map_blocks map;
+ sector_t invalid_block = ~((sector_t) 0xffff);
int ret = 0;
BUG_ON(create == 0);
BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
+ if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
+ invalid_block = ~0;
+
map.m_lblk = iblock;
map.m_len = 1;
@@ -1842,10 +1871,17 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
* preallocated blocks are unmapped but should treated
* the same as allocated blocks.
*/
- ret = ext4_da_map_blocks(inode, iblock, &map, bh);
- if (ret <= 0)
+ ret = ext4_da_map_blocks(inode, &map);
+ if (ret < 0)
return ret;
+ if (map.m_flags & EXT4_MAP_DELAYED) {
+ map_bh(bh, inode->i_sb, invalid_block);
+ set_buffer_new(bh);
+ set_buffer_delay(bh);
+ return 0;
+ }
+
map_bh(bh, inode->i_sb, map.m_pblk);
ext4_update_bh_state(bh, map.m_flags);
@@ -1893,7 +1929,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
len = folio_size(folio);
if (folio_pos(folio) + len > size &&
!ext4_verity_in_progress(mpd->inode))
- len = size & ~PAGE_MASK;
+ len = size & (len - 1);
err = ext4_bio_write_folio(&mpd->io_submit, folio, len);
if (!err)
mpd->wbc->nr_to_write--;
@@ -2173,11 +2209,6 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
* writeback and there is nothing we can do about it so it might result
* in data loss. So use reserved blocks to allocate metadata if
* possible.
- *
- * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if
- * the blocks in question are delalloc blocks. This indicates
- * that the blocks and quotas has already been checked when
- * the data was copied into the page cache.
*/
get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
EXT4_GET_BLOCKS_METADATA_NOFAIL |
@@ -2185,8 +2216,6 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
dioread_nolock = ext4_should_dioread_nolock(inode);
if (dioread_nolock)
get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
- if (map->m_flags & BIT(BH_Delay))
- get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
if (err < 0)
@@ -2880,7 +2909,7 @@ static int ext4_nonda_switch(struct super_block *sb)
static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
int ret, retries = 0;
struct folio *folio;
@@ -2895,14 +2924,14 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) {
*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
return ext4_write_begin(file, mapping, pos,
- len, pagep, fsdata);
+ len, foliop, fsdata);
}
*fsdata = (void *)0;
trace_ext4_da_write_begin(inode, pos, len);
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len,
- pagep, fsdata);
+ foliop, fsdata);
if (ret < 0)
return ret;
if (ret == 1)
@@ -2915,11 +2944,8 @@ retry:
if (IS_ERR(folio))
return PTR_ERR(folio);
-#ifdef CONFIG_FS_ENCRYPTION
- ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep);
-#else
- ret = __block_write_begin(&folio->page, pos, len, ext4_da_get_block_prep);
-#endif
+ ret = ext4_block_write_begin(NULL, folio, pos, len,
+ ext4_da_get_block_prep);
if (ret < 0) {
folio_unlock(folio);
folio_put(folio);
@@ -2937,7 +2963,7 @@ retry:
return ret;
}
- *pagep = &folio->page;
+ *foliop = folio;
return ret;
}
@@ -2971,7 +2997,8 @@ static int ext4_da_do_write_end(struct address_space *mapping,
struct inode *inode = mapping->host;
loff_t old_size = inode->i_size;
bool disksize_changed = false;
- loff_t new_i_size;
+ loff_t new_i_size, zero_len = 0;
+ handle_t *handle;
if (unlikely(!folio_buffers(folio))) {
folio_unlock(folio);
@@ -2983,7 +3010,7 @@ static int ext4_da_do_write_end(struct address_space *mapping,
* flag, which all that's needed to trigger page writeback.
*/
copied = block_write_end(NULL, mapping, pos, len, copied,
- &folio->page, NULL);
+ folio, NULL);
new_i_size = pos + copied;
/*
@@ -3015,18 +3042,21 @@ static int ext4_da_do_write_end(struct address_space *mapping,
folio_unlock(folio);
folio_put(folio);
- if (old_size < pos)
+ if (pos > old_size) {
pagecache_isize_extended(inode, old_size, pos);
+ zero_len = pos - old_size;
+ }
- if (disksize_changed) {
- handle_t *handle;
+ if (!disksize_changed && !zero_len)
+ return copied;
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- ext4_mark_inode_dirty(handle, inode);
- ext4_journal_stop(handle);
- }
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ if (zero_len)
+ ext4_zero_partial_blocks(handle, inode, old_size, zero_len);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
return copied;
}
@@ -3034,15 +3064,14 @@ static int ext4_da_do_write_end(struct address_space *mapping,
static int ext4_da_write_end(struct file *file,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
int write_mode = (int)(unsigned long)fsdata;
- struct folio *folio = page_folio(page);
if (write_mode == FALL_BACK_TO_NONDELALLOC)
return ext4_write_end(file, mapping, pos,
- len, copied, &folio->page, fsdata);
+ len, copied, folio, fsdata);
trace_ext4_da_write_end(inode, pos, len, copied);
@@ -3238,7 +3267,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
}
/* Any metadata buffers to write? */
- if (!list_empty(&inode->i_mapping->private_list))
+ if (!list_empty(&inode->i_mapping->i_private_list))
return true;
return inode->i_state & I_DIRTY_DATASYNC;
}
@@ -3292,6 +3321,9 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
iomap->addr = (u64) map->m_pblk << blkbits;
if (flags & IOMAP_DAX)
iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off;
+ } else if (map->m_flags & EXT4_MAP_DELAYED) {
+ iomap->type = IOMAP_DELALLOC;
+ iomap->addr = IOMAP_NULL_ADDR;
} else {
iomap->type = IOMAP_HOLE;
iomap->addr = IOMAP_NULL_ADDR;
@@ -3454,35 +3486,11 @@ const struct iomap_ops ext4_iomap_overwrite_ops = {
.iomap_end = ext4_iomap_end,
};
-static bool ext4_iomap_is_delalloc(struct inode *inode,
- struct ext4_map_blocks *map)
-{
- struct extent_status es;
- ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1;
-
- ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
- map->m_lblk, end, &es);
-
- if (!es.es_len || es.es_lblk > end)
- return false;
-
- if (es.es_lblk > map->m_lblk) {
- map->m_len = es.es_lblk - map->m_lblk;
- return false;
- }
-
- offset = map->m_lblk - es.es_lblk;
- map->m_len = es.es_len - offset;
-
- return true;
-}
-
static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
loff_t length, unsigned int flags,
struct iomap *iomap, struct iomap *srcmap)
{
int ret;
- bool delalloc = false;
struct ext4_map_blocks map;
u8 blkbits = inode->i_blkbits;
@@ -3523,13 +3531,8 @@ static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
ret = ext4_map_blocks(NULL, inode, &map, 0);
if (ret < 0)
return ret;
- if (ret == 0)
- delalloc = ext4_iomap_is_delalloc(inode, &map);
-
set_iomap:
ext4_set_iomap(inode, iomap, &map, offset, length, flags);
- if (delalloc && iomap->type == IOMAP_HOLE)
- iomap->type = IOMAP_DELALLOC;
return 0;
}
@@ -3586,10 +3589,9 @@ static const struct address_space_operations ext4_aops = {
.bmap = ext4_bmap,
.invalidate_folio = ext4_invalidate_folio,
.release_folio = ext4_release_folio,
- .direct_IO = noop_direct_IO,
.migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
.swap_activate = ext4_iomap_swap_activate,
};
@@ -3603,10 +3605,9 @@ static const struct address_space_operations ext4_journalled_aops = {
.bmap = ext4_bmap,
.invalidate_folio = ext4_journalled_invalidate_folio,
.release_folio = ext4_release_folio,
- .direct_IO = noop_direct_IO,
.migrate_folio = buffer_migrate_folio_norefs,
.is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
.swap_activate = ext4_iomap_swap_activate,
};
@@ -3620,16 +3621,14 @@ static const struct address_space_operations ext4_da_aops = {
.bmap = ext4_bmap,
.invalidate_folio = ext4_invalidate_folio,
.release_folio = ext4_release_folio,
- .direct_IO = noop_direct_IO,
.migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
.swap_activate = ext4_iomap_swap_activate,
};
static const struct address_space_operations ext4_dax_aops = {
.writepages = ext4_dax_writepages,
- .direct_IO = noop_direct_IO,
.dirty_folio = noop_dirty_folio,
.bmap = ext4_bmap,
.swap_activate = ext4_iomap_swap_activate,
@@ -3655,6 +3654,12 @@ void ext4_set_aops(struct inode *inode)
inode->i_mapping->a_ops = &ext4_aops;
}
+/*
+ * Here we can't skip an unwritten buffer even though it usually reads zero
+ * because it might have data in pagecache (eg, if called from ext4_zero_range,
+ * ext4_punch_hole, etc) which needs to be properly zeroed out. Otherwise a
+ * racing writeback can come later and flush the stale pagecache to disk.
+ */
static int __ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
{
@@ -3678,10 +3683,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,
iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
bh = folio_buffers(folio);
- if (!bh) {
- create_empty_buffers(&folio->page, blocksize, 0);
- bh = folio_buffers(folio);
- }
+ if (!bh)
+ bh = create_empty_buffers(folio, blocksize, 0);
/* Find the buffer that contains "offset" */
pos = blocksize;
@@ -3883,6 +3886,68 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
return ret;
}
+static inline void ext4_truncate_folio(struct inode *inode,
+ loff_t start, loff_t end)
+{
+ unsigned long blocksize = i_blocksize(inode);
+ struct folio *folio;
+
+ /* Nothing to be done if no complete block needs to be truncated. */
+ if (round_up(start, blocksize) >= round_down(end, blocksize))
+ return;
+
+ folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT);
+ if (IS_ERR(folio))
+ return;
+
+ if (folio_mkclean(folio))
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+}
+
+int ext4_truncate_page_cache_block_range(struct inode *inode,
+ loff_t start, loff_t end)
+{
+ unsigned long blocksize = i_blocksize(inode);
+ int ret;
+
+ /*
+ * For journalled data we need to write (and checkpoint) pages
+ * before discarding page cache to avoid inconsitent data on disk
+ * in case of crash before freeing or unwritten converting trans
+ * is committed.
+ */
+ if (ext4_should_journal_data(inode)) {
+ ret = filemap_write_and_wait_range(inode->i_mapping, start,
+ end - 1);
+ if (ret)
+ return ret;
+ goto truncate_pagecache;
+ }
+
+ /*
+ * If the block size is less than the page size, the file's mapped
+ * blocks within one page could be freed or converted to unwritten.
+ * So it's necessary to remove writable userspace mappings, and then
+ * ext4_page_mkwrite() can be called during subsequent write access
+ * to these partial folios.
+ */
+ if (!IS_ALIGNED(start | end, PAGE_SIZE) &&
+ blocksize < PAGE_SIZE && start < inode->i_size) {
+ loff_t page_boundary = round_up(start, PAGE_SIZE);
+
+ ext4_truncate_folio(inode, start, min(page_boundary, end));
+ if (end > page_boundary)
+ ext4_truncate_folio(inode,
+ round_down(end, PAGE_SIZE), end);
+ }
+
+truncate_pagecache:
+ truncate_pagecache_range(inode, start, end - 1);
+ return 0;
+}
+
static void ext4_wait_dax_page(struct inode *inode)
{
filemap_invalidate_unlock(inode->i_mapping);
@@ -3927,91 +3992,56 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- ext4_lblk_t first_block, stop_block;
- struct address_space *mapping = inode->i_mapping;
- loff_t first_block_offset, last_block_offset, max_length;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ ext4_lblk_t start_lblk, end_lblk;
+ loff_t max_end = sb->s_maxbytes;
+ loff_t end = offset + length;
handle_t *handle;
unsigned int credits;
- int ret = 0, ret2 = 0;
+ int ret;
trace_ext4_punch_hole(inode, offset, length, 0);
+ WARN_ON_ONCE(!inode_is_locked(inode));
/*
- * Write out all dirty pages to avoid race conditions
- * Then release them.
+ * For indirect-block based inodes, make sure that the hole within
+ * one block before last range.
*/
- if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
- ret = filemap_write_and_wait_range(mapping, offset,
- offset + length - 1);
- if (ret)
- return ret;
- }
-
- inode_lock(inode);
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize;
/* No need to punch hole beyond i_size */
- if (offset >= inode->i_size)
- goto out_mutex;
+ if (offset >= inode->i_size || offset >= max_end)
+ return 0;
/*
- * If the hole extends beyond i_size, set the hole
- * to end after the page that contains i_size
+ * If the hole extends beyond i_size, set the hole to end after
+ * the page that contains i_size.
*/
- if (offset + length > inode->i_size) {
- length = inode->i_size +
- PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) -
- offset;
- }
+ if (end > inode->i_size)
+ end = round_up(inode->i_size, PAGE_SIZE);
+ if (end > max_end)
+ end = max_end;
+ length = end - offset;
/*
- * For punch hole the length + offset needs to be within one block
- * before last range. Adjust the length if it goes beyond that limit.
+ * Attach jinode to inode for jbd2 if we do any zeroing of partial
+ * block.
*/
- max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
- if (offset + length > max_length)
- length = max_length - offset;
-
- if (offset & (sb->s_blocksize - 1) ||
- (offset + length) & (sb->s_blocksize - 1)) {
- /*
- * Attach jinode to inode for jbd2 if we do any zeroing of
- * partial block
- */
+ if (!IS_ALIGNED(offset | end, sb->s_blocksize)) {
ret = ext4_inode_attach_jinode(inode);
if (ret < 0)
- goto out_mutex;
-
+ return ret;
}
- /* Wait all existing dio workers, newcomers will block on i_rwsem */
- inode_dio_wait(inode);
-
- ret = file_modified(file);
- if (ret)
- goto out_mutex;
-
- /*
- * Prevent page faults from reinstantiating pages we have released from
- * page cache.
- */
- filemap_invalidate_lock(mapping);
- ret = ext4_break_layouts(inode);
+ ret = ext4_update_disksize_before_punch(inode, offset, length);
if (ret)
- goto out_dio;
-
- first_block_offset = round_up(offset, sb->s_blocksize);
- last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
+ return ret;
/* Now release the pages and zero block aligned part of pages*/
- if (last_block_offset > first_block_offset) {
- ret = ext4_update_disksize_before_punch(inode, offset, length);
- if (ret)
- goto out_dio;
- truncate_pagecache_range(inode, first_block_offset,
- last_block_offset);
- }
+ ret = ext4_truncate_page_cache_block_range(inode, offset, end);
+ if (ret)
+ return ret;
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
credits = ext4_writepage_trans_blocks(inode);
@@ -4021,52 +4051,51 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_std_error(sb, ret);
- goto out_dio;
+ return ret;
}
- ret = ext4_zero_partial_blocks(handle, inode, offset,
- length);
+ ret = ext4_zero_partial_blocks(handle, inode, offset, length);
if (ret)
- goto out_stop;
-
- first_block = (offset + sb->s_blocksize - 1) >>
- EXT4_BLOCK_SIZE_BITS(sb);
- stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
+ goto out_handle;
/* If there are blocks to remove, do it */
- if (stop_block > first_block) {
+ start_lblk = EXT4_B_TO_LBLK(inode, offset);
+ end_lblk = end >> inode->i_blkbits;
+
+ if (end_lblk > start_lblk) {
+ ext4_lblk_t hole_len = end_lblk - start_lblk;
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
- ext4_es_remove_extent(inode, first_block,
- stop_block - first_block);
+ ext4_es_remove_extent(inode, start_lblk, hole_len);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- ret = ext4_ext_remove_space(inode, first_block,
- stop_block - 1);
+ ret = ext4_ext_remove_space(inode, start_lblk,
+ end_lblk - 1);
else
- ret = ext4_ind_remove_space(handle, inode, first_block,
- stop_block);
+ ret = ext4_ind_remove_space(handle, inode, start_lblk,
+ end_lblk);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_handle;
+ }
+ ext4_es_insert_extent(inode, start_lblk, hole_len, ~0,
+ EXTENT_STATUS_HOLE, 0);
up_write(&EXT4_I(inode)->i_data_sem);
}
- ext4_fc_track_range(handle, inode, first_block, stop_block);
+ ext4_fc_track_range(handle, inode, start_lblk, end_lblk);
+
+ ret = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(ret))
+ goto out_handle;
+
+ ext4_update_inode_fsync_trans(handle, inode, 1);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
-
- inode->i_mtime = inode_set_ctime_current(inode);
- ret2 = ext4_mark_inode_dirty(handle, inode);
- if (unlikely(ret2))
- ret = ret2;
- if (ret >= 0)
- ext4_update_inode_fsync_trans(handle, inode, 1);
-out_stop:
+out_handle:
ext4_journal_stop(handle);
-out_dio:
- filemap_invalidate_unlock(mapping);
-out_mutex:
- inode_unlock(inode);
return ret;
}
@@ -4190,7 +4219,7 @@ int ext4_truncate(struct inode *inode)
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
err = ext4_ext_truncate(handle, inode);
@@ -4215,7 +4244,7 @@ out_stop:
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
err2 = ext4_mark_inode_dirty(handle, inode);
if (unlikely(err2 && !err))
err = err2;
@@ -4319,8 +4348,8 @@ static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode
raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
EXT4_INODE_SET_CTIME(inode, raw_inode);
- EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
- EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
+ EXT4_INODE_SET_MTIME(inode, raw_inode);
+ EXT4_INODE_SET_ATIME(inode, raw_inode);
EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
@@ -4684,22 +4713,43 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val)
inode_set_iversion_queried(inode, val);
}
-static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags)
-
+static int check_igot_inode(struct inode *inode, ext4_iget_flags flags,
+ const char *function, unsigned int line)
{
+ const char *err_str;
+
if (flags & EXT4_IGET_EA_INODE) {
- if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
- return "missing EA_INODE flag";
+ if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
+ err_str = "missing EA_INODE flag";
+ goto error;
+ }
if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
- EXT4_I(inode)->i_file_acl)
- return "ea_inode with extended attributes";
+ EXT4_I(inode)->i_file_acl) {
+ err_str = "ea_inode with extended attributes";
+ goto error;
+ }
} else {
- if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
- return "unexpected EA_INODE flag";
+ if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
+ /*
+ * open_by_handle_at() could provide an old inode number
+ * that has since been reused for an ea_inode; this does
+ * not indicate filesystem corruption
+ */
+ if (flags & EXT4_IGET_HANDLE)
+ return -ESTALE;
+ err_str = "unexpected EA_INODE flag";
+ goto error;
+ }
+ }
+ if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) {
+ err_str = "unexpected bad inode w/o EXT4_IGET_BAD";
+ goto error;
}
- if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD))
- return "unexpected bad inode w/o EXT4_IGET_BAD";
- return NULL;
+ return 0;
+
+error:
+ ext4_error_inode(inode, function, line, 0, err_str);
+ return -EFSCORRUPTED;
}
struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
@@ -4711,7 +4761,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
struct ext4_inode_info *ei;
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
struct inode *inode;
- const char *err_str;
journal_t *journal = EXT4_SB(sb)->s_journal;
long ret;
loff_t size;
@@ -4740,10 +4789,10 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
if (!inode)
return ERR_PTR(-ENOMEM);
if (!(inode->i_state & I_NEW)) {
- if ((err_str = check_igot_inode(inode, flags)) != NULL) {
- ext4_error_inode(inode, function, line, 0, err_str);
+ ret = check_igot_inode(inode, flags, function, line);
+ if (ret) {
iput(inode);
- return ERR_PTR(-EFSCORRUPTED);
+ return ERR_PTR(ret);
}
return inode;
}
@@ -4855,7 +4904,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ei->i_file_acl |=
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
inode->i_size = ext4_isize(sb, raw_inode);
- if ((size = i_size_read(inode)) < 0) {
+ size = i_size_read(inode);
+ if (size < 0 || size > ext4_get_maxbytes(inode)) {
ext4_error_inode(inode, function, line, 0,
"iget: bad i_size value: %lld", size);
ret = -EFSCORRUPTED;
@@ -4928,8 +4978,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
}
EXT4_INODE_GET_CTIME(inode, raw_inode);
- EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
- EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
+ EXT4_INODE_GET_ATIME(inode, raw_inode);
+ EXT4_INODE_GET_MTIME(inode, raw_inode);
EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
@@ -5015,13 +5065,21 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ret = -EFSCORRUPTED;
goto bad_inode;
}
- if ((err_str = check_igot_inode(inode, flags)) != NULL) {
- ext4_error_inode(inode, function, line, 0, err_str);
- ret = -EFSCORRUPTED;
- goto bad_inode;
+ ret = check_igot_inode(inode, flags, function, line);
+ /*
+ * -ESTALE here means there is nothing inherently wrong with the inode,
+ * it's just not an inode we can return for an fhandle lookup.
+ */
+ if (ret == -ESTALE) {
+ brelse(iloc.bh);
+ unlock_new_inode(inode);
+ iput(inode);
+ return ERR_PTR(-ESTALE);
}
-
+ if (ret)
+ goto bad_inode;
brelse(iloc.bh);
+
unlock_new_inode(inode);
return inode;
@@ -5054,8 +5112,8 @@ static void __ext4_update_other_inode_time(struct super_block *sb,
spin_lock(&ei->i_raw_lock);
EXT4_INODE_SET_CTIME(inode, raw_inode);
- EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
- EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
+ EXT4_INODE_SET_MTIME(inode, raw_inode);
+ EXT4_INODE_SET_ATIME(inode, raw_inode);
ext4_inode_csum_set(inode, raw_inode, ei);
spin_unlock(&ei->i_raw_lock);
trace_ext4_other_inode_update_time(inode, orig_ino);
@@ -5437,6 +5495,14 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
}
if (attr->ia_size != inode->i_size) {
+ /* attach jbd2 jinode for EOF folio tail zeroing */
+ if (attr->ia_size & (inode->i_sb->s_blocksize - 1) ||
+ oldsize & (inode->i_sb->s_blocksize - 1)) {
+ error = ext4_inode_attach_jinode(inode);
+ if (error)
+ goto out_mmap_sem;
+ }
+
handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
@@ -5447,11 +5513,17 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
orphan = 1;
}
/*
- * Update c/mtime on truncate up, ext4_truncate() will
- * update c/mtime in shrink case below
+ * Update c/mtime and tail zero the EOF folio on
+ * truncate up. ext4_truncate() handles the shrink case
+ * below.
*/
- if (!shrink)
- inode->i_mtime = inode_set_ctime_current(inode);
+ if (!shrink) {
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_current(inode));
+ if (oldsize & (inode->i_sb->s_blocksize - 1))
+ ext4_block_truncate_page(handle,
+ inode->i_mapping, oldsize);
+ }
if (shrink)
ext4_fc_track_range(handle, inode,
@@ -6199,7 +6271,8 @@ retry_alloc:
if (folio_pos(folio) + len > size)
len = size - folio_pos(folio);
- err = __block_write_begin(&folio->page, 0, len, ext4_get_block);
+ err = ext4_block_write_begin(handle, folio, 0, len,
+ ext4_get_block);
if (!err) {
ret = VM_FAULT_SIGBUS;
if (ext4_journal_folio_buffers(handle, folio, len))
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 0bfe2ce589e2..1c77400bd88e 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -312,13 +312,22 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
struct ext4_inode_info *ei1;
struct ext4_inode_info *ei2;
unsigned long tmp;
+ struct timespec64 ts1, ts2;
ei1 = EXT4_I(inode1);
ei2 = EXT4_I(inode2);
swap(inode1->i_version, inode2->i_version);
- swap(inode1->i_atime, inode2->i_atime);
- swap(inode1->i_mtime, inode2->i_mtime);
+
+ ts1 = inode_get_atime(inode1);
+ ts2 = inode_get_atime(inode2);
+ inode_set_atime_to_ts(inode1, ts2);
+ inode_set_atime_to_ts(inode2, ts1);
+
+ ts1 = inode_get_mtime(inode1);
+ ts2 = inode_get_mtime(inode2);
+ inode_set_mtime_to_ts(inode1, ts2);
+ inode_set_mtime_to_ts(inode2, ts1);
memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP;
@@ -458,7 +467,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
ext4_reset_inode_seed(inode);
ext4_reset_inode_seed(inode_bl);
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
err = ext4_mark_inode_dirty(handle, inode);
if (err < 0) {
@@ -810,11 +819,11 @@ int ext4_force_shutdown(struct super_block *sb, u32 flags)
switch (flags) {
case EXT4_GOING_FLAGS_DEFAULT:
- ret = freeze_bdev(sb->s_bdev);
+ ret = bdev_freeze(sb->s_bdev);
if (ret)
return ret;
set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
- thaw_bdev(sb->s_bdev);
+ bdev_thaw(sb->s_bdev);
break;
case EXT4_GOING_FLAGS_LOGFLUSH:
set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
@@ -1141,9 +1150,8 @@ static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label
*/
BUILD_BUG_ON(EXT4_LABEL_MAX >= FSLABEL_MAX);
- memset(label, 0, sizeof(label));
lock_buffer(sbi->s_sbh);
- strncpy(label, sbi->s_es->s_volume_name, EXT4_LABEL_MAX);
+ memtostr_pad(label, sbi->s_es->s_volume_name);
unlock_buffer(sbi->s_sbh);
if (copy_to_user(user_label, label, sizeof(label)))
@@ -1335,10 +1343,10 @@ group_extend_out:
me.moved_len = 0;
donor = fdget(me.donor_fd);
- if (!donor.file)
+ if (!fd_file(donor))
return -EBADF;
- if (!(donor.file->f_mode & FMODE_WRITE)) {
+ if (!(fd_file(donor)->f_mode & FMODE_WRITE)) {
err = -EBADF;
goto mext_out;
}
@@ -1359,7 +1367,7 @@ group_extend_out:
if (err)
goto mext_out;
- err = ext4_move_extents(filp, donor.file, me.orig_start,
+ err = ext4_move_extents(filp, fd_file(donor), me.orig_start,
me.donor_start, me.len, &me.moved_len);
mnt_drop_write_file(filp);
diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c
new file mode 100644
index 000000000000..f13db95284d9
--- /dev/null
+++ b/fs/ext4/mballoc-test.c
@@ -0,0 +1,1001 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit test of ext4 multiblocks allocation.
+ */
+
+#include <kunit/test.h>
+#include <kunit/static_stub.h>
+#include <linux/random.h>
+
+#include "ext4.h"
+
+struct mbt_grp_ctx {
+ struct buffer_head bitmap_bh;
+ /* desc and gd_bh are just the place holders for now */
+ struct ext4_group_desc desc;
+ struct buffer_head gd_bh;
+};
+
+struct mbt_ctx {
+ struct mbt_grp_ctx *grp_ctx;
+};
+
+struct mbt_ext4_super_block {
+ struct ext4_super_block es;
+ struct ext4_sb_info sbi;
+ struct mbt_ctx mbt_ctx;
+};
+
+#define MBT_SB(_sb) (container_of((_sb)->s_fs_info, struct mbt_ext4_super_block, sbi))
+#define MBT_CTX(_sb) (&MBT_SB(_sb)->mbt_ctx)
+#define MBT_GRP_CTX(_sb, _group) (&MBT_CTX(_sb)->grp_ctx[_group])
+
+static struct inode *mbt_alloc_inode(struct super_block *sb)
+{
+ struct ext4_inode_info *ei;
+
+ ei = kmalloc(sizeof(struct ext4_inode_info), GFP_KERNEL);
+ if (!ei)
+ return NULL;
+
+ INIT_LIST_HEAD(&ei->i_orphan);
+ init_rwsem(&ei->xattr_sem);
+ init_rwsem(&ei->i_data_sem);
+ inode_init_once(&ei->vfs_inode);
+ ext4_fc_init_inode(&ei->vfs_inode);
+
+ return &ei->vfs_inode;
+}
+
+static void mbt_free_inode(struct inode *inode)
+{
+ kfree(EXT4_I(inode));
+}
+
+static const struct super_operations mbt_sops = {
+ .alloc_inode = mbt_alloc_inode,
+ .free_inode = mbt_free_inode,
+};
+
+static void mbt_kill_sb(struct super_block *sb)
+{
+ generic_shutdown_super(sb);
+}
+
+static struct file_system_type mbt_fs_type = {
+ .name = "mballoc test",
+ .kill_sb = mbt_kill_sb,
+};
+
+static int mbt_mb_init(struct super_block *sb)
+{
+ ext4_fsblk_t block;
+ int ret;
+
+ /* needed by ext4_mb_init->bdev_nonrot(sb->s_bdev) */
+ sb->s_bdev = kzalloc(sizeof(*sb->s_bdev), GFP_KERNEL);
+ if (sb->s_bdev == NULL)
+ return -ENOMEM;
+
+ sb->s_bdev->bd_queue = kzalloc(sizeof(struct request_queue), GFP_KERNEL);
+ if (sb->s_bdev->bd_queue == NULL) {
+ kfree(sb->s_bdev);
+ return -ENOMEM;
+ }
+
+ /*
+ * needed by ext4_mb_init->ext4_mb_init_backend-> sbi->s_buddy_cache =
+ * new_inode(sb);
+ */
+ INIT_LIST_HEAD(&sb->s_inodes);
+ sb->s_op = &mbt_sops;
+
+ ret = ext4_mb_init(sb);
+ if (ret != 0)
+ goto err_out;
+
+ block = ext4_count_free_clusters(sb);
+ ret = percpu_counter_init(&EXT4_SB(sb)->s_freeclusters_counter, block,
+ GFP_KERNEL);
+ if (ret != 0)
+ goto err_mb_release;
+
+ ret = percpu_counter_init(&EXT4_SB(sb)->s_dirtyclusters_counter, 0,
+ GFP_KERNEL);
+ if (ret != 0)
+ goto err_freeclusters;
+
+ return 0;
+
+err_freeclusters:
+ percpu_counter_destroy(&EXT4_SB(sb)->s_freeclusters_counter);
+err_mb_release:
+ ext4_mb_release(sb);
+err_out:
+ kfree(sb->s_bdev->bd_queue);
+ kfree(sb->s_bdev);
+ return ret;
+}
+
+static void mbt_mb_release(struct super_block *sb)
+{
+ percpu_counter_destroy(&EXT4_SB(sb)->s_dirtyclusters_counter);
+ percpu_counter_destroy(&EXT4_SB(sb)->s_freeclusters_counter);
+ ext4_mb_release(sb);
+ kfree(sb->s_bdev->bd_queue);
+ kfree(sb->s_bdev);
+}
+
+static int mbt_set(struct super_block *sb, void *data)
+{
+ return 0;
+}
+
+static struct super_block *mbt_ext4_alloc_super_block(void)
+{
+ struct mbt_ext4_super_block *fsb;
+ struct super_block *sb;
+ struct ext4_sb_info *sbi;
+
+ fsb = kzalloc(sizeof(*fsb), GFP_KERNEL);
+ if (fsb == NULL)
+ return NULL;
+
+ sb = sget(&mbt_fs_type, NULL, mbt_set, 0, NULL);
+ if (IS_ERR(sb))
+ goto out;
+
+ sbi = &fsb->sbi;
+
+ sbi->s_blockgroup_lock =
+ kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+ if (!sbi->s_blockgroup_lock)
+ goto out_deactivate;
+
+ bgl_lock_init(sbi->s_blockgroup_lock);
+
+ sbi->s_es = &fsb->es;
+ sbi->s_sb = sb;
+ sb->s_fs_info = sbi;
+
+ up_write(&sb->s_umount);
+ return sb;
+
+out_deactivate:
+ deactivate_locked_super(sb);
+out:
+ kfree(fsb);
+ return NULL;
+}
+
+static void mbt_ext4_free_super_block(struct super_block *sb)
+{
+ struct mbt_ext4_super_block *fsb = MBT_SB(sb);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ kfree(sbi->s_blockgroup_lock);
+ deactivate_super(sb);
+ kfree(fsb);
+}
+
+struct mbt_ext4_block_layout {
+ unsigned char blocksize_bits;
+ unsigned int cluster_bits;
+ uint32_t blocks_per_group;
+ ext4_group_t group_count;
+ uint16_t desc_size;
+};
+
+static void mbt_init_sb_layout(struct super_block *sb,
+ struct mbt_ext4_block_layout *layout)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+
+ sb->s_blocksize = 1UL << layout->blocksize_bits;
+ sb->s_blocksize_bits = layout->blocksize_bits;
+
+ sbi->s_groups_count = layout->group_count;
+ sbi->s_blocks_per_group = layout->blocks_per_group;
+ sbi->s_cluster_bits = layout->cluster_bits;
+ sbi->s_cluster_ratio = 1U << layout->cluster_bits;
+ sbi->s_clusters_per_group = layout->blocks_per_group >>
+ layout->cluster_bits;
+ sbi->s_desc_size = layout->desc_size;
+ sbi->s_desc_per_block_bits =
+ sb->s_blocksize_bits - (fls(layout->desc_size) - 1);
+ sbi->s_desc_per_block = 1 << sbi->s_desc_per_block_bits;
+
+ es->s_first_data_block = cpu_to_le32(0);
+ es->s_blocks_count_lo = cpu_to_le32(layout->blocks_per_group *
+ layout->group_count);
+}
+
+static int mbt_grp_ctx_init(struct super_block *sb,
+ struct mbt_grp_ctx *grp_ctx)
+{
+ ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
+
+ grp_ctx->bitmap_bh.b_data = kzalloc(EXT4_BLOCK_SIZE(sb), GFP_KERNEL);
+ if (grp_ctx->bitmap_bh.b_data == NULL)
+ return -ENOMEM;
+ mb_set_bits(grp_ctx->bitmap_bh.b_data, max, sb->s_blocksize * 8 - max);
+ ext4_free_group_clusters_set(sb, &grp_ctx->desc, max);
+
+ return 0;
+}
+
+static void mbt_grp_ctx_release(struct mbt_grp_ctx *grp_ctx)
+{
+ kfree(grp_ctx->bitmap_bh.b_data);
+ grp_ctx->bitmap_bh.b_data = NULL;
+}
+
+static void mbt_ctx_mark_used(struct super_block *sb, ext4_group_t group,
+ unsigned int start, unsigned int len)
+{
+ struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group);
+
+ mb_set_bits(grp_ctx->bitmap_bh.b_data, start, len);
+}
+
+static void *mbt_ctx_bitmap(struct super_block *sb, ext4_group_t group)
+{
+ struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group);
+
+ return grp_ctx->bitmap_bh.b_data;
+}
+
+/* called after mbt_init_sb_layout */
+static int mbt_ctx_init(struct super_block *sb)
+{
+ struct mbt_ctx *ctx = MBT_CTX(sb);
+ ext4_group_t i, ngroups = ext4_get_groups_count(sb);
+
+ ctx->grp_ctx = kcalloc(ngroups, sizeof(struct mbt_grp_ctx),
+ GFP_KERNEL);
+ if (ctx->grp_ctx == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < ngroups; i++)
+ if (mbt_grp_ctx_init(sb, &ctx->grp_ctx[i]))
+ goto out;
+
+ /*
+ * first data block(first cluster in first group) is used by
+ * metadata, mark it used to avoid to alloc data block at first
+ * block which will fail ext4_sb_block_valid check.
+ */
+ mb_set_bits(ctx->grp_ctx[0].bitmap_bh.b_data, 0, 1);
+ ext4_free_group_clusters_set(sb, &ctx->grp_ctx[0].desc,
+ EXT4_CLUSTERS_PER_GROUP(sb) - 1);
+
+ return 0;
+out:
+ while (i-- > 0)
+ mbt_grp_ctx_release(&ctx->grp_ctx[i]);
+ kfree(ctx->grp_ctx);
+ return -ENOMEM;
+}
+
+static void mbt_ctx_release(struct super_block *sb)
+{
+ struct mbt_ctx *ctx = MBT_CTX(sb);
+ ext4_group_t i, ngroups = ext4_get_groups_count(sb);
+
+ for (i = 0; i < ngroups; i++)
+ mbt_grp_ctx_release(&ctx->grp_ctx[i]);
+ kfree(ctx->grp_ctx);
+}
+
+static struct buffer_head *
+ext4_read_block_bitmap_nowait_stub(struct super_block *sb, ext4_group_t block_group,
+ bool ignore_locked)
+{
+ struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, block_group);
+
+ /* paired with brelse from caller of ext4_read_block_bitmap_nowait */
+ get_bh(&grp_ctx->bitmap_bh);
+ return &grp_ctx->bitmap_bh;
+}
+
+static int ext4_wait_block_bitmap_stub(struct super_block *sb,
+ ext4_group_t block_group,
+ struct buffer_head *bh)
+{
+ /*
+ * real ext4_wait_block_bitmap will set these flags and
+ * functions like ext4_mb_init_cache will verify the flags.
+ */
+ set_buffer_uptodate(bh);
+ set_bitmap_uptodate(bh);
+ set_buffer_verified(bh);
+ return 0;
+}
+
+static struct ext4_group_desc *
+ext4_get_group_desc_stub(struct super_block *sb, ext4_group_t block_group,
+ struct buffer_head **bh)
+{
+ struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, block_group);
+
+ if (bh != NULL)
+ *bh = &grp_ctx->gd_bh;
+
+ return &grp_ctx->desc;
+}
+
+static int
+ext4_mb_mark_context_stub(handle_t *handle, struct super_block *sb, bool state,
+ ext4_group_t group, ext4_grpblk_t blkoff,
+ ext4_grpblk_t len, int flags,
+ ext4_grpblk_t *ret_changed)
+{
+ struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group);
+ struct buffer_head *bitmap_bh = &grp_ctx->bitmap_bh;
+
+ if (state)
+ mb_set_bits(bitmap_bh->b_data, blkoff, len);
+ else
+ mb_clear_bits(bitmap_bh->b_data, blkoff, len);
+
+ return 0;
+}
+
+#define TEST_GOAL_GROUP 1
+static int mbt_kunit_init(struct kunit *test)
+{
+ struct mbt_ext4_block_layout *layout =
+ (struct mbt_ext4_block_layout *)(test->param_value);
+ struct super_block *sb;
+ int ret;
+
+ sb = mbt_ext4_alloc_super_block();
+ if (sb == NULL)
+ return -ENOMEM;
+
+ mbt_init_sb_layout(sb, layout);
+
+ ret = mbt_ctx_init(sb);
+ if (ret != 0) {
+ mbt_ext4_free_super_block(sb);
+ return ret;
+ }
+
+ test->priv = sb;
+ kunit_activate_static_stub(test,
+ ext4_read_block_bitmap_nowait,
+ ext4_read_block_bitmap_nowait_stub);
+ kunit_activate_static_stub(test,
+ ext4_wait_block_bitmap,
+ ext4_wait_block_bitmap_stub);
+ kunit_activate_static_stub(test,
+ ext4_get_group_desc,
+ ext4_get_group_desc_stub);
+ kunit_activate_static_stub(test,
+ ext4_mb_mark_context,
+ ext4_mb_mark_context_stub);
+
+ /* stub function will be called in mbt_mb_init->ext4_mb_init */
+ if (mbt_mb_init(sb) != 0) {
+ mbt_ctx_release(sb);
+ mbt_ext4_free_super_block(sb);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void mbt_kunit_exit(struct kunit *test)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+
+ mbt_mb_release(sb);
+ mbt_ctx_release(sb);
+ mbt_ext4_free_super_block(sb);
+}
+
+static void test_new_blocks_simple(struct kunit *test)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ struct inode *inode;
+ struct ext4_allocation_request ar;
+ ext4_group_t i, goal_group = TEST_GOAL_GROUP;
+ int err = 0;
+ ext4_fsblk_t found;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL);
+ if (!inode)
+ return;
+
+ inode->i_sb = sb;
+ ar.inode = inode;
+
+ /* get block at goal */
+ ar.goal = ext4_group_first_block_no(sb, goal_group);
+ found = ext4_mb_new_blocks_simple(&ar, &err);
+ KUNIT_ASSERT_EQ_MSG(test, ar.goal, found,
+ "failed to alloc block at goal, expected %llu found %llu",
+ ar.goal, found);
+
+ /* get block after goal in goal group */
+ ar.goal = ext4_group_first_block_no(sb, goal_group);
+ found = ext4_mb_new_blocks_simple(&ar, &err);
+ KUNIT_ASSERT_EQ_MSG(test, ar.goal + EXT4_C2B(sbi, 1), found,
+ "failed to alloc block after goal in goal group, expected %llu found %llu",
+ ar.goal + 1, found);
+
+ /* get block after goal group */
+ mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb));
+ ar.goal = ext4_group_first_block_no(sb, goal_group);
+ found = ext4_mb_new_blocks_simple(&ar, &err);
+ KUNIT_ASSERT_EQ_MSG(test,
+ ext4_group_first_block_no(sb, goal_group + 1), found,
+ "failed to alloc block after goal group, expected %llu found %llu",
+ ext4_group_first_block_no(sb, goal_group + 1), found);
+
+ /* get block before goal group */
+ for (i = goal_group; i < ext4_get_groups_count(sb); i++)
+ mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb));
+ ar.goal = ext4_group_first_block_no(sb, goal_group);
+ found = ext4_mb_new_blocks_simple(&ar, &err);
+ KUNIT_ASSERT_EQ_MSG(test,
+ ext4_group_first_block_no(sb, 0) + EXT4_C2B(sbi, 1), found,
+ "failed to alloc block before goal group, expected %llu found %llu",
+ ext4_group_first_block_no(sb, 0 + EXT4_C2B(sbi, 1)), found);
+
+ /* no block available, fail to allocate block */
+ for (i = 0; i < ext4_get_groups_count(sb); i++)
+ mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb));
+ ar.goal = ext4_group_first_block_no(sb, goal_group);
+ found = ext4_mb_new_blocks_simple(&ar, &err);
+ KUNIT_ASSERT_NE_MSG(test, err, 0,
+ "unexpectedly get block when no block is available");
+}
+
+#define TEST_RANGE_COUNT 8
+
+struct test_range {
+ ext4_grpblk_t start;
+ ext4_grpblk_t len;
+};
+
+static void
+mbt_generate_test_ranges(struct super_block *sb, struct test_range *ranges,
+ int count)
+{
+ ext4_grpblk_t start, len, max;
+ int i;
+
+ max = EXT4_CLUSTERS_PER_GROUP(sb) / count;
+ for (i = 0; i < count; i++) {
+ start = get_random_u32() % max;
+ len = get_random_u32() % max;
+ len = min(len, max - start);
+
+ ranges[i].start = start + i * max;
+ ranges[i].len = len;
+ }
+}
+
+static void
+validate_free_blocks_simple(struct kunit *test, struct super_block *sb,
+ ext4_group_t goal_group, ext4_grpblk_t start,
+ ext4_grpblk_t len)
+{
+ void *bitmap;
+ ext4_grpblk_t bit, max = EXT4_CLUSTERS_PER_GROUP(sb);
+ ext4_group_t i;
+
+ for (i = 0; i < ext4_get_groups_count(sb); i++) {
+ if (i == goal_group)
+ continue;
+
+ bitmap = mbt_ctx_bitmap(sb, i);
+ bit = mb_find_next_zero_bit(bitmap, max, 0);
+ KUNIT_ASSERT_EQ_MSG(test, bit, max,
+ "free block on unexpected group %d", i);
+ }
+
+ bitmap = mbt_ctx_bitmap(sb, goal_group);
+ bit = mb_find_next_zero_bit(bitmap, max, 0);
+ KUNIT_ASSERT_EQ(test, bit, start);
+
+ bit = mb_find_next_bit(bitmap, max, bit + 1);
+ KUNIT_ASSERT_EQ(test, bit, start + len);
+}
+
+static void
+test_free_blocks_simple_range(struct kunit *test, ext4_group_t goal_group,
+ ext4_grpblk_t start, ext4_grpblk_t len)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode;
+ ext4_fsblk_t block;
+
+ inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL);
+ if (!inode)
+ return;
+ inode->i_sb = sb;
+
+ if (len == 0)
+ return;
+
+ block = ext4_group_first_block_no(sb, goal_group) +
+ EXT4_C2B(sbi, start);
+ ext4_free_blocks_simple(inode, block, len);
+ validate_free_blocks_simple(test, sb, goal_group, start, len);
+ mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb));
+}
+
+static void test_free_blocks_simple(struct kunit *test)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
+ ext4_group_t i;
+ struct test_range ranges[TEST_RANGE_COUNT];
+
+ for (i = 0; i < ext4_get_groups_count(sb); i++)
+ mbt_ctx_mark_used(sb, i, 0, max);
+
+ mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+ for (i = 0; i < TEST_RANGE_COUNT; i++)
+ test_free_blocks_simple_range(test, TEST_GOAL_GROUP,
+ ranges[i].start, ranges[i].len);
+}
+
+static void
+test_mark_diskspace_used_range(struct kunit *test,
+ struct ext4_allocation_context *ac,
+ ext4_grpblk_t start,
+ ext4_grpblk_t len)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ int ret;
+ void *bitmap;
+ ext4_grpblk_t i, max;
+
+ /* ext4_mb_mark_diskspace_used will BUG if len is 0 */
+ if (len == 0)
+ return;
+
+ ac->ac_b_ex.fe_group = TEST_GOAL_GROUP;
+ ac->ac_b_ex.fe_start = start;
+ ac->ac_b_ex.fe_len = len;
+
+ bitmap = mbt_ctx_bitmap(sb, TEST_GOAL_GROUP);
+ memset(bitmap, 0, sb->s_blocksize);
+ ret = ext4_mb_mark_diskspace_used(ac, NULL, 0);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ max = EXT4_CLUSTERS_PER_GROUP(sb);
+ i = mb_find_next_bit(bitmap, max, 0);
+ KUNIT_ASSERT_EQ(test, i, start);
+ i = mb_find_next_zero_bit(bitmap, max, i + 1);
+ KUNIT_ASSERT_EQ(test, i, start + len);
+ i = mb_find_next_bit(bitmap, max, i + 1);
+ KUNIT_ASSERT_EQ(test, max, i);
+}
+
+static void test_mark_diskspace_used(struct kunit *test)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ struct inode *inode;
+ struct ext4_allocation_context ac;
+ struct test_range ranges[TEST_RANGE_COUNT];
+ int i;
+
+ mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+
+ inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL);
+ if (!inode)
+ return;
+ inode->i_sb = sb;
+
+ ac.ac_status = AC_STATUS_FOUND;
+ ac.ac_sb = sb;
+ ac.ac_inode = inode;
+ for (i = 0; i < TEST_RANGE_COUNT; i++)
+ test_mark_diskspace_used_range(test, &ac, ranges[i].start,
+ ranges[i].len);
+}
+
+static void mbt_generate_buddy(struct super_block *sb, void *buddy,
+ void *bitmap, struct ext4_group_info *grp)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ uint32_t order, off;
+ void *bb, *bb_h;
+ int max;
+
+ memset(buddy, 0xff, sb->s_blocksize);
+ memset(grp, 0, offsetof(struct ext4_group_info,
+ bb_counters[MB_NUM_ORDERS(sb)]));
+
+ bb = bitmap;
+ max = EXT4_CLUSTERS_PER_GROUP(sb);
+ bb_h = buddy + sbi->s_mb_offsets[1];
+
+ off = mb_find_next_zero_bit(bb, max, 0);
+ grp->bb_first_free = off;
+ while (off < max) {
+ grp->bb_counters[0]++;
+ grp->bb_free++;
+
+ if (!(off & 1) && !mb_test_bit(off + 1, bb)) {
+ grp->bb_free++;
+ grp->bb_counters[0]--;
+ mb_clear_bit(off >> 1, bb_h);
+ grp->bb_counters[1]++;
+ grp->bb_largest_free_order = 1;
+ off++;
+ }
+
+ off = mb_find_next_zero_bit(bb, max, off + 1);
+ }
+
+ for (order = 1; order < MB_NUM_ORDERS(sb) - 1; order++) {
+ bb = buddy + sbi->s_mb_offsets[order];
+ bb_h = buddy + sbi->s_mb_offsets[order + 1];
+ max = max >> 1;
+ off = mb_find_next_zero_bit(bb, max, 0);
+
+ while (off < max) {
+ if (!(off & 1) && !mb_test_bit(off + 1, bb)) {
+ mb_set_bits(bb, off, 2);
+ grp->bb_counters[order] -= 2;
+ mb_clear_bit(off >> 1, bb_h);
+ grp->bb_counters[order + 1]++;
+ grp->bb_largest_free_order = order + 1;
+ off++;
+ }
+
+ off = mb_find_next_zero_bit(bb, max, off + 1);
+ }
+ }
+
+ max = EXT4_CLUSTERS_PER_GROUP(sb);
+ off = mb_find_next_zero_bit(bitmap, max, 0);
+ while (off < max) {
+ grp->bb_fragments++;
+
+ off = mb_find_next_bit(bitmap, max, off + 1);
+ if (off + 1 >= max)
+ break;
+
+ off = mb_find_next_zero_bit(bitmap, max, off + 1);
+ }
+}
+
+static void
+mbt_validate_group_info(struct kunit *test, struct ext4_group_info *grp1,
+ struct ext4_group_info *grp2)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ int i;
+
+ KUNIT_ASSERT_EQ(test, grp1->bb_first_free,
+ grp2->bb_first_free);
+ KUNIT_ASSERT_EQ(test, grp1->bb_fragments,
+ grp2->bb_fragments);
+ KUNIT_ASSERT_EQ(test, grp1->bb_free, grp2->bb_free);
+ KUNIT_ASSERT_EQ(test, grp1->bb_largest_free_order,
+ grp2->bb_largest_free_order);
+
+ for (i = 1; i < MB_NUM_ORDERS(sb); i++) {
+ KUNIT_ASSERT_EQ_MSG(test, grp1->bb_counters[i],
+ grp2->bb_counters[i],
+ "bb_counters[%d] diffs, expected %d, generated %d",
+ i, grp1->bb_counters[i],
+ grp2->bb_counters[i]);
+ }
+}
+
+static void
+do_test_generate_buddy(struct kunit *test, struct super_block *sb, void *bitmap,
+ void *mbt_buddy, struct ext4_group_info *mbt_grp,
+ void *ext4_buddy, struct ext4_group_info *ext4_grp)
+{
+ int i;
+
+ mbt_generate_buddy(sb, mbt_buddy, bitmap, mbt_grp);
+
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++)
+ ext4_grp->bb_counters[i] = 0;
+ /* needed by validation in ext4_mb_generate_buddy */
+ ext4_grp->bb_free = mbt_grp->bb_free;
+ memset(ext4_buddy, 0xff, sb->s_blocksize);
+ ext4_mb_generate_buddy(sb, ext4_buddy, bitmap, TEST_GOAL_GROUP,
+ ext4_grp);
+
+ KUNIT_ASSERT_EQ(test, memcmp(mbt_buddy, ext4_buddy, sb->s_blocksize),
+ 0);
+ mbt_validate_group_info(test, mbt_grp, ext4_grp);
+}
+
+static void test_mb_generate_buddy(struct kunit *test)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ void *bitmap, *expected_bb, *generate_bb;
+ struct ext4_group_info *expected_grp, *generate_grp;
+ struct test_range ranges[TEST_RANGE_COUNT];
+ int i;
+
+ bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap);
+ expected_bb = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, expected_bb);
+ generate_bb = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, generate_bb);
+ expected_grp = kunit_kzalloc(test, offsetof(struct ext4_group_info,
+ bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, expected_grp);
+ generate_grp = ext4_get_group_info(sb, TEST_GOAL_GROUP);
+ KUNIT_ASSERT_NOT_NULL(test, generate_grp);
+
+ mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+ for (i = 0; i < TEST_RANGE_COUNT; i++) {
+ mb_set_bits(bitmap, ranges[i].start, ranges[i].len);
+ do_test_generate_buddy(test, sb, bitmap, expected_bb,
+ expected_grp, generate_bb, generate_grp);
+ }
+}
+
+static void
+test_mb_mark_used_range(struct kunit *test, struct ext4_buddy *e4b,
+ ext4_grpblk_t start, ext4_grpblk_t len, void *bitmap,
+ void *buddy, struct ext4_group_info *grp)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ struct ext4_free_extent ex;
+ int i;
+
+ /* mb_mark_used only accepts non-zero len */
+ if (len == 0)
+ return;
+
+ ex.fe_start = start;
+ ex.fe_len = len;
+ ex.fe_group = TEST_GOAL_GROUP;
+
+ ext4_lock_group(sb, TEST_GOAL_GROUP);
+ mb_mark_used(e4b, &ex);
+ ext4_unlock_group(sb, TEST_GOAL_GROUP);
+
+ mb_set_bits(bitmap, start, len);
+ /* bypass bb_free validatoin in ext4_mb_generate_buddy */
+ grp->bb_free -= len;
+ memset(buddy, 0xff, sb->s_blocksize);
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++)
+ grp->bb_counters[i] = 0;
+ ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp);
+
+ KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize),
+ 0);
+ mbt_validate_group_info(test, grp, e4b->bd_info);
+}
+
+static void test_mb_mark_used(struct kunit *test)
+{
+ struct ext4_buddy e4b;
+ struct super_block *sb = (struct super_block *)test->priv;
+ void *bitmap, *buddy;
+ struct ext4_group_info *grp;
+ int ret;
+ struct test_range ranges[TEST_RANGE_COUNT];
+ int i;
+
+ /* buddy cache assumes that each page contains at least one block */
+ if (sb->s_blocksize > PAGE_SIZE)
+ kunit_skip(test, "blocksize exceeds pagesize");
+
+ bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap);
+ buddy = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy);
+ grp = kunit_kzalloc(test, offsetof(struct ext4_group_info,
+ bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
+
+ ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ grp->bb_free = EXT4_CLUSTERS_PER_GROUP(sb);
+ grp->bb_largest_free_order = -1;
+ grp->bb_avg_fragment_size_order = -1;
+ INIT_LIST_HEAD(&grp->bb_largest_free_order_node);
+ INIT_LIST_HEAD(&grp->bb_avg_fragment_size_node);
+ mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+ for (i = 0; i < TEST_RANGE_COUNT; i++)
+ test_mb_mark_used_range(test, &e4b, ranges[i].start,
+ ranges[i].len, bitmap, buddy, grp);
+
+ ext4_mb_unload_buddy(&e4b);
+}
+
+static void
+test_mb_free_blocks_range(struct kunit *test, struct ext4_buddy *e4b,
+ ext4_grpblk_t start, ext4_grpblk_t len, void *bitmap,
+ void *buddy, struct ext4_group_info *grp)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ int i;
+
+ /* mb_free_blocks will WARN if len is 0 */
+ if (len == 0)
+ return;
+
+ ext4_lock_group(sb, e4b->bd_group);
+ mb_free_blocks(NULL, e4b, start, len);
+ ext4_unlock_group(sb, e4b->bd_group);
+
+ mb_clear_bits(bitmap, start, len);
+ /* bypass bb_free validatoin in ext4_mb_generate_buddy */
+ grp->bb_free += len;
+ memset(buddy, 0xff, sb->s_blocksize);
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++)
+ grp->bb_counters[i] = 0;
+ ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp);
+
+ KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize),
+ 0);
+ mbt_validate_group_info(test, grp, e4b->bd_info);
+
+}
+
+static void test_mb_free_blocks(struct kunit *test)
+{
+ struct ext4_buddy e4b;
+ struct super_block *sb = (struct super_block *)test->priv;
+ void *bitmap, *buddy;
+ struct ext4_group_info *grp;
+ struct ext4_free_extent ex;
+ int ret;
+ int i;
+ struct test_range ranges[TEST_RANGE_COUNT];
+
+ /* buddy cache assumes that each page contains at least one block */
+ if (sb->s_blocksize > PAGE_SIZE)
+ kunit_skip(test, "blocksize exceeds pagesize");
+
+ bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap);
+ buddy = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy);
+ grp = kunit_kzalloc(test, offsetof(struct ext4_group_info,
+ bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
+
+ ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ ex.fe_start = 0;
+ ex.fe_len = EXT4_CLUSTERS_PER_GROUP(sb);
+ ex.fe_group = TEST_GOAL_GROUP;
+
+ ext4_lock_group(sb, TEST_GOAL_GROUP);
+ mb_mark_used(&e4b, &ex);
+ ext4_unlock_group(sb, TEST_GOAL_GROUP);
+
+ grp->bb_free = 0;
+ grp->bb_largest_free_order = -1;
+ grp->bb_avg_fragment_size_order = -1;
+ INIT_LIST_HEAD(&grp->bb_largest_free_order_node);
+ INIT_LIST_HEAD(&grp->bb_avg_fragment_size_node);
+ memset(bitmap, 0xff, sb->s_blocksize);
+
+ mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+ for (i = 0; i < TEST_RANGE_COUNT; i++)
+ test_mb_free_blocks_range(test, &e4b, ranges[i].start,
+ ranges[i].len, bitmap, buddy, grp);
+
+ ext4_mb_unload_buddy(&e4b);
+}
+
+#define COUNT_FOR_ESTIMATE 100000
+static void test_mb_mark_used_cost(struct kunit *test)
+{
+ struct ext4_buddy e4b;
+ struct super_block *sb = (struct super_block *)test->priv;
+ struct ext4_free_extent ex;
+ int ret;
+ struct test_range ranges[TEST_RANGE_COUNT];
+ int i, j;
+ unsigned long start, end, all = 0;
+
+ /* buddy cache assumes that each page contains at least one block */
+ if (sb->s_blocksize > PAGE_SIZE)
+ kunit_skip(test, "blocksize exceeds pagesize");
+
+ ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ ex.fe_group = TEST_GOAL_GROUP;
+ for (j = 0; j < COUNT_FOR_ESTIMATE; j++) {
+ mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+ start = jiffies;
+ for (i = 0; i < TEST_RANGE_COUNT; i++) {
+ if (ranges[i].len == 0)
+ continue;
+
+ ex.fe_start = ranges[i].start;
+ ex.fe_len = ranges[i].len;
+ ext4_lock_group(sb, TEST_GOAL_GROUP);
+ mb_mark_used(&e4b, &ex);
+ ext4_unlock_group(sb, TEST_GOAL_GROUP);
+ }
+ end = jiffies;
+ all += (end - start);
+
+ for (i = 0; i < TEST_RANGE_COUNT; i++) {
+ if (ranges[i].len == 0)
+ continue;
+
+ ext4_lock_group(sb, TEST_GOAL_GROUP);
+ mb_free_blocks(NULL, &e4b, ranges[i].start,
+ ranges[i].len);
+ ext4_unlock_group(sb, TEST_GOAL_GROUP);
+ }
+ }
+
+ kunit_info(test, "costed jiffies %lu\n", all);
+ ext4_mb_unload_buddy(&e4b);
+}
+
+static const struct mbt_ext4_block_layout mbt_test_layouts[] = {
+ {
+ .blocksize_bits = 10,
+ .cluster_bits = 3,
+ .blocks_per_group = 8192,
+ .group_count = 4,
+ .desc_size = 64,
+ },
+ {
+ .blocksize_bits = 12,
+ .cluster_bits = 3,
+ .blocks_per_group = 8192,
+ .group_count = 4,
+ .desc_size = 64,
+ },
+ {
+ .blocksize_bits = 16,
+ .cluster_bits = 3,
+ .blocks_per_group = 8192,
+ .group_count = 4,
+ .desc_size = 64,
+ },
+};
+
+static void mbt_show_layout(const struct mbt_ext4_block_layout *layout,
+ char *desc)
+{
+ snprintf(desc, KUNIT_PARAM_DESC_SIZE, "block_bits=%d cluster_bits=%d "
+ "blocks_per_group=%d group_count=%d desc_size=%d\n",
+ layout->blocksize_bits, layout->cluster_bits,
+ layout->blocks_per_group, layout->group_count,
+ layout->desc_size);
+}
+KUNIT_ARRAY_PARAM(mbt_layouts, mbt_test_layouts, mbt_show_layout);
+
+static struct kunit_case mbt_test_cases[] = {
+ KUNIT_CASE_PARAM(test_new_blocks_simple, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM(test_free_blocks_simple, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM(test_mb_generate_buddy, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM(test_mb_mark_used, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM(test_mb_free_blocks, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM(test_mark_diskspace_used, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM_ATTR(test_mb_mark_used_cost, mbt_layouts_gen_params,
+ { .speed = KUNIT_SPEED_SLOW }),
+ {}
+};
+
+static struct kunit_suite mbt_test_suite = {
+ .name = "ext4_mballoc_test",
+ .init = mbt_kunit_init,
+ .exit = mbt_kunit_exit,
+ .test_cases = mbt_test_cases,
+};
+
+kunit_test_suites(&mbt_test_suite);
+
+MODULE_LICENSE("GPL");
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 8a9f8c95c6f1..76331cdb4cb5 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -18,6 +18,7 @@
#include <linux/backing-dev.h>
#include <linux/freezer.h>
#include <trace/events/ext4.h>
+#include <kunit/static_stub.h>
/*
* MUSTDO:
@@ -563,14 +564,14 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
+ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
ext4_grp_locked_error(sb, e4b->bd_group,
inode ? inode->i_ino : 0,
blocknr,
"freeing block already freed "
"(bit %u)",
first + i);
- ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
- EXT4_GROUP_INFO_BBITMAP_CORRUPT);
}
mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
}
@@ -676,7 +677,7 @@ do { \
} \
} while (0)
-static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
+static void __mb_check_buddy(struct ext4_buddy *e4b, char *file,
const char *function, int line)
{
struct super_block *sb = e4b->bd_sb;
@@ -695,7 +696,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
void *buddy2;
if (e4b->bd_info->bb_check_counter++ % 10)
- return 0;
+ return;
while (order > 1) {
buddy = mb_find_buddy(e4b, order, &max);
@@ -757,7 +758,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
grp = ext4_get_group_info(sb, e4b->bd_group);
if (!grp)
- return NULL;
+ return;
list_for_each(cur, &grp->bb_prealloc_list) {
ext4_group_t groupnr;
struct ext4_prealloc_space *pa;
@@ -767,7 +768,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
for (i = 0; i < pa->pa_len; i++)
MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
}
- return 0;
}
#undef MB_CHECK_ASSERT
#define mb_check_buddy(e4b) __mb_check_buddy(e4b, \
@@ -841,30 +841,30 @@ static void
mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- int new_order;
+ int new, old;
- if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_fragments == 0)
+ if (!test_opt2(sb, MB_OPTIMIZE_SCAN))
return;
- new_order = mb_avg_fragment_size_order(sb,
- grp->bb_free / grp->bb_fragments);
- if (new_order == grp->bb_avg_fragment_size_order)
+ old = grp->bb_avg_fragment_size_order;
+ new = grp->bb_fragments == 0 ? -1 :
+ mb_avg_fragment_size_order(sb, grp->bb_free / grp->bb_fragments);
+ if (new == old)
return;
- if (grp->bb_avg_fragment_size_order != -1) {
- write_lock(&sbi->s_mb_avg_fragment_size_locks[
- grp->bb_avg_fragment_size_order]);
+ if (old >= 0) {
+ write_lock(&sbi->s_mb_avg_fragment_size_locks[old]);
list_del(&grp->bb_avg_fragment_size_node);
- write_unlock(&sbi->s_mb_avg_fragment_size_locks[
- grp->bb_avg_fragment_size_order]);
+ write_unlock(&sbi->s_mb_avg_fragment_size_locks[old]);
+ }
+
+ grp->bb_avg_fragment_size_order = new;
+ if (new >= 0) {
+ write_lock(&sbi->s_mb_avg_fragment_size_locks[new]);
+ list_add_tail(&grp->bb_avg_fragment_size_node,
+ &sbi->s_mb_avg_fragment_size[new]);
+ write_unlock(&sbi->s_mb_avg_fragment_size_locks[new]);
}
- grp->bb_avg_fragment_size_order = new_order;
- write_lock(&sbi->s_mb_avg_fragment_size_locks[
- grp->bb_avg_fragment_size_order]);
- list_add_tail(&grp->bb_avg_fragment_size_node,
- &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]);
- write_unlock(&sbi->s_mb_avg_fragment_size_locks[
- grp->bb_avg_fragment_size_order]);
}
/*
@@ -872,7 +872,7 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
* cr level needs an update.
*/
static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac,
- enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+ enum criteria *new_cr, ext4_group_t *group)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_group_info *iter;
@@ -946,7 +946,7 @@ ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int o
* order. Updates *new_cr if cr level needs an update.
*/
static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac,
- enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+ enum criteria *new_cr, ext4_group_t *group)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_group_info *grp = NULL;
@@ -991,7 +991,7 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *
* much and fall to CR_GOAL_LEN_SLOW in that case.
*/
static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac,
- enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+ enum criteria *new_cr, ext4_group_t *group)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_group_info *grp = NULL;
@@ -1080,23 +1080,11 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac)
}
/*
- * Return next linear group for allocation. If linear traversal should not be
- * performed, this function just returns the same group
+ * Return next linear group for allocation.
*/
static ext4_group_t
-next_linear_group(struct ext4_allocation_context *ac, ext4_group_t group,
- ext4_group_t ngroups)
+next_linear_group(ext4_group_t group, ext4_group_t ngroups)
{
- if (!should_optimize_scan(ac))
- goto inc_and_return;
-
- if (ac->ac_groups_linear_remaining) {
- ac->ac_groups_linear_remaining--;
- goto inc_and_return;
- }
-
- return group;
-inc_and_return:
/*
* Artificially restricted ngroups for non-extent
* files makes group > ngroups possible on first loop.
@@ -1122,21 +1110,33 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
{
*new_cr = ac->ac_criteria;
- if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) {
- *group = next_linear_group(ac, *group, ngroups);
+ if (!should_optimize_scan(ac)) {
+ *group = next_linear_group(*group, ngroups);
+ return;
+ }
+
+ /*
+ * Optimized scanning can return non adjacent groups which can cause
+ * seek overhead for rotational disks. So try few linear groups before
+ * trying optimized scan.
+ */
+ if (ac->ac_groups_linear_remaining) {
+ *group = next_linear_group(*group, ngroups);
+ ac->ac_groups_linear_remaining--;
return;
}
if (*new_cr == CR_POWER2_ALIGNED) {
- ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group, ngroups);
+ ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group);
} else if (*new_cr == CR_GOAL_LEN_FAST) {
- ext4_mb_choose_next_group_goal_fast(ac, new_cr, group, ngroups);
+ ext4_mb_choose_next_group_goal_fast(ac, new_cr, group);
} else if (*new_cr == CR_BEST_AVAIL_LEN) {
- ext4_mb_choose_next_group_best_avail(ac, new_cr, group, ngroups);
+ ext4_mb_choose_next_group_best_avail(ac, new_cr, group);
} else {
/*
- * TODO: For CR=2, we can arrange groups in an rb tree sorted by
- * bb_free. But until that happens, we should never come here.
+ * TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an
+ * rb tree sorted by bb_free. But until that happens, we should
+ * never come here.
*/
WARN_ON(1);
}
@@ -1150,33 +1150,28 @@ static void
mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- int i;
+ int new, old = grp->bb_largest_free_order;
- for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--)
- if (grp->bb_counters[i] > 0)
+ for (new = MB_NUM_ORDERS(sb) - 1; new >= 0; new--)
+ if (grp->bb_counters[new] > 0)
break;
+
/* No need to move between order lists? */
- if (!test_opt2(sb, MB_OPTIMIZE_SCAN) ||
- i == grp->bb_largest_free_order) {
- grp->bb_largest_free_order = i;
+ if (new == old)
return;
- }
- if (grp->bb_largest_free_order >= 0) {
- write_lock(&sbi->s_mb_largest_free_orders_locks[
- grp->bb_largest_free_order]);
+ if (old >= 0 && !list_empty(&grp->bb_largest_free_order_node)) {
+ write_lock(&sbi->s_mb_largest_free_orders_locks[old]);
list_del_init(&grp->bb_largest_free_order_node);
- write_unlock(&sbi->s_mb_largest_free_orders_locks[
- grp->bb_largest_free_order]);
+ write_unlock(&sbi->s_mb_largest_free_orders_locks[old]);
}
- grp->bb_largest_free_order = i;
- if (grp->bb_largest_free_order >= 0 && grp->bb_free) {
- write_lock(&sbi->s_mb_largest_free_orders_locks[
- grp->bb_largest_free_order]);
+
+ grp->bb_largest_free_order = new;
+ if (test_opt2(sb, MB_OPTIMIZE_SCAN) && new >= 0 && grp->bb_free) {
+ write_lock(&sbi->s_mb_largest_free_orders_locks[new]);
list_add_tail(&grp->bb_largest_free_order_node,
- &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]);
- write_unlock(&sbi->s_mb_largest_free_orders_locks[
- grp->bb_largest_free_order]);
+ &sbi->s_mb_largest_free_orders[new]);
+ write_unlock(&sbi->s_mb_largest_free_orders_locks[new]);
}
}
@@ -1274,7 +1269,7 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b)
* for this page; do not hold this lock when calling this routine!
*/
-static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
+static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp)
{
ext4_group_t ngroups;
unsigned int blocksize;
@@ -1292,13 +1287,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
char *bitmap;
struct ext4_group_info *grinfo;
- inode = page->mapping->host;
+ inode = folio->mapping->host;
sb = inode->i_sb;
ngroups = ext4_get_groups_count(sb);
blocksize = i_blocksize(inode);
blocks_per_page = PAGE_SIZE / blocksize;
- mb_debug(sb, "init page %lu\n", page->index);
+ mb_debug(sb, "init folio %lu\n", folio->index);
groups_per_page = blocks_per_page >> 1;
if (groups_per_page == 0)
@@ -1313,9 +1308,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
} else
bh = &bhs;
- first_group = page->index * blocks_per_page / 2;
+ first_group = folio->index * blocks_per_page / 2;
- /* read all groups the page covers into the cache */
+ /* read all groups the folio covers into the cache */
for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
if (group >= ngroups)
break;
@@ -1326,10 +1321,11 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
/*
* If page is uptodate then we came here after online resize
* which added some new uninitialized group info structs, so
- * we must skip all initialized uptodate buddies on the page,
+ * we must skip all initialized uptodate buddies on the folio,
* which may be currently in use by an allocating task.
*/
- if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
+ if (folio_test_uptodate(folio) &&
+ !EXT4_MB_GRP_NEED_INIT(grinfo)) {
bh[i] = NULL;
continue;
}
@@ -1353,7 +1349,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
err = err2;
}
- first_block = page->index * blocks_per_page;
+ first_block = folio->index * blocks_per_page;
for (i = 0; i < blocks_per_page; i++) {
group = (first_block + i) >> 1;
if (group >= ngroups)
@@ -1374,7 +1370,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
* above
*
*/
- data = page_address(page) + (i * blocksize);
+ data = folio_address(folio) + (i * blocksize);
bitmap = bh[group - first_group]->b_data;
/*
@@ -1389,8 +1385,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
if ((first_block + i) & 1) {
/* this is block of buddy */
BUG_ON(incore == NULL);
- mb_debug(sb, "put buddy for group %u in page %lu/%x\n",
- group, page->index, i * blocksize);
+ mb_debug(sb, "put buddy for group %u in folio %lu/%x\n",
+ group, folio->index, i * blocksize);
trace_ext4_mb_buddy_bitmap_load(sb, group);
grinfo->bb_fragments = 0;
memset(grinfo->bb_counters, 0,
@@ -1408,8 +1404,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
} else {
/* this is block of bitmap */
BUG_ON(incore != NULL);
- mb_debug(sb, "put bitmap for group %u in page %lu/%x\n",
- group, page->index, i * blocksize);
+ mb_debug(sb, "put bitmap for group %u in folio %lu/%x\n",
+ group, folio->index, i * blocksize);
trace_ext4_mb_bitmap_load(sb, group);
/* see comments in ext4_mb_put_pa() */
@@ -1427,7 +1423,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
incore = data;
}
}
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
out:
if (bh) {
@@ -1443,7 +1439,7 @@ out:
* Lock the buddy and bitmap pages. This make sure other parallel init_group
* on the same buddy page doesn't happen whild holding the buddy page lock.
* Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
- * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
+ * are on the same page e4b->bd_buddy_folio is NULL and return value is 0.
*/
static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
@@ -1451,10 +1447,10 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
int block, pnum, poff;
int blocks_per_page;
- struct page *page;
+ struct folio *folio;
- e4b->bd_buddy_page = NULL;
- e4b->bd_bitmap_page = NULL;
+ e4b->bd_buddy_folio = NULL;
+ e4b->bd_bitmap_folio = NULL;
blocks_per_page = PAGE_SIZE / sb->s_blocksize;
/*
@@ -1465,37 +1461,38 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
block = group * 2;
pnum = block / blocks_per_page;
poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, gfp);
- if (!page)
- return -ENOMEM;
- BUG_ON(page->mapping != inode->i_mapping);
- e4b->bd_bitmap_page = page;
- e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
+ folio = __filemap_get_folio(inode->i_mapping, pnum,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ BUG_ON(folio->mapping != inode->i_mapping);
+ e4b->bd_bitmap_folio = folio;
+ e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize);
if (blocks_per_page >= 2) {
/* buddy and bitmap are on the same page */
return 0;
}
- block++;
- pnum = block / blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, gfp);
- if (!page)
- return -ENOMEM;
- BUG_ON(page->mapping != inode->i_mapping);
- e4b->bd_buddy_page = page;
+ /* blocks_per_page == 1, hence we need another page for the buddy */
+ folio = __filemap_get_folio(inode->i_mapping, block + 1,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ BUG_ON(folio->mapping != inode->i_mapping);
+ e4b->bd_buddy_folio = folio;
return 0;
}
static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
{
- if (e4b->bd_bitmap_page) {
- unlock_page(e4b->bd_bitmap_page);
- put_page(e4b->bd_bitmap_page);
+ if (e4b->bd_bitmap_folio) {
+ folio_unlock(e4b->bd_bitmap_folio);
+ folio_put(e4b->bd_bitmap_folio);
}
- if (e4b->bd_buddy_page) {
- unlock_page(e4b->bd_buddy_page);
- put_page(e4b->bd_buddy_page);
+ if (e4b->bd_buddy_folio) {
+ folio_unlock(e4b->bd_buddy_folio);
+ folio_put(e4b->bd_buddy_folio);
}
}
@@ -1510,7 +1507,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
struct ext4_group_info *this_grp;
struct ext4_buddy e4b;
- struct page *page;
+ struct folio *folio;
int ret = 0;
might_sleep();
@@ -1537,16 +1534,16 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
goto err;
}
- page = e4b.bd_bitmap_page;
- ret = ext4_mb_init_cache(page, NULL, gfp);
+ folio = e4b.bd_bitmap_folio;
+ ret = ext4_mb_init_cache(folio, NULL, gfp);
if (ret)
goto err;
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
ret = -EIO;
goto err;
}
- if (e4b.bd_buddy_page == NULL) {
+ if (e4b.bd_buddy_folio == NULL) {
/*
* If both the bitmap and buddy are in
* the same page we don't need to force
@@ -1556,11 +1553,11 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
goto err;
}
/* init buddy cache */
- page = e4b.bd_buddy_page;
- ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp);
+ folio = e4b.bd_buddy_folio;
+ ret = ext4_mb_init_cache(folio, e4b.bd_bitmap, gfp);
if (ret)
goto err;
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
ret = -EIO;
goto err;
}
@@ -1582,7 +1579,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
int block;
int pnum;
int poff;
- struct page *page;
+ struct folio *folio;
int ret;
struct ext4_group_info *grp;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -1600,8 +1597,8 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
e4b->bd_info = grp;
e4b->bd_sb = sb;
e4b->bd_group = group;
- e4b->bd_buddy_page = NULL;
- e4b->bd_bitmap_page = NULL;
+ e4b->bd_buddy_folio = NULL;
+ e4b->bd_bitmap_folio = NULL;
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
/*
@@ -1622,102 +1619,103 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
pnum = block / blocks_per_page;
poff = block % blocks_per_page;
- /* we could use find_or_create_page(), but it locks page
- * what we'd like to avoid in fast path ... */
- page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
- if (page == NULL || !PageUptodate(page)) {
- if (page)
+ /* Avoid locking the folio in the fast path ... */
+ folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0);
+ if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
+ if (!IS_ERR(folio))
/*
- * drop the page reference and try
- * to get the page with lock. If we
+ * drop the folio reference and try
+ * to get the folio with lock. If we
* are not uptodate that implies
- * somebody just created the page but
- * is yet to initialize the same. So
+ * somebody just created the folio but
+ * is yet to initialize it. So
* wait for it to initialize.
*/
- put_page(page);
- page = find_or_create_page(inode->i_mapping, pnum, gfp);
- if (page) {
- if (WARN_RATELIMIT(page->mapping != inode->i_mapping,
- "ext4: bitmap's paging->mapping != inode->i_mapping\n")) {
+ folio_put(folio);
+ folio = __filemap_get_folio(inode->i_mapping, pnum,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
+ if (!IS_ERR(folio)) {
+ if (WARN_RATELIMIT(folio->mapping != inode->i_mapping,
+ "ext4: bitmap's mapping != inode->i_mapping\n")) {
/* should never happen */
- unlock_page(page);
+ folio_unlock(folio);
ret = -EINVAL;
goto err;
}
- if (!PageUptodate(page)) {
- ret = ext4_mb_init_cache(page, NULL, gfp);
+ if (!folio_test_uptodate(folio)) {
+ ret = ext4_mb_init_cache(folio, NULL, gfp);
if (ret) {
- unlock_page(page);
+ folio_unlock(folio);
goto err;
}
- mb_cmp_bitmaps(e4b, page_address(page) +
+ mb_cmp_bitmaps(e4b, folio_address(folio) +
(poff * sb->s_blocksize));
}
- unlock_page(page);
+ folio_unlock(folio);
}
}
- if (page == NULL) {
- ret = -ENOMEM;
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
goto err;
}
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
ret = -EIO;
goto err;
}
- /* Pages marked accessed already */
- e4b->bd_bitmap_page = page;
- e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
+ /* Folios marked accessed already */
+ e4b->bd_bitmap_folio = folio;
+ e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize);
block++;
pnum = block / blocks_per_page;
poff = block % blocks_per_page;
- page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
- if (page == NULL || !PageUptodate(page)) {
- if (page)
- put_page(page);
- page = find_or_create_page(inode->i_mapping, pnum, gfp);
- if (page) {
- if (WARN_RATELIMIT(page->mapping != inode->i_mapping,
- "ext4: buddy bitmap's page->mapping != inode->i_mapping\n")) {
+ folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0);
+ if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
+ if (!IS_ERR(folio))
+ folio_put(folio);
+ folio = __filemap_get_folio(inode->i_mapping, pnum,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
+ if (!IS_ERR(folio)) {
+ if (WARN_RATELIMIT(folio->mapping != inode->i_mapping,
+ "ext4: buddy bitmap's mapping != inode->i_mapping\n")) {
/* should never happen */
- unlock_page(page);
+ folio_unlock(folio);
ret = -EINVAL;
goto err;
}
- if (!PageUptodate(page)) {
- ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
+ if (!folio_test_uptodate(folio)) {
+ ret = ext4_mb_init_cache(folio, e4b->bd_bitmap,
gfp);
if (ret) {
- unlock_page(page);
+ folio_unlock(folio);
goto err;
}
}
- unlock_page(page);
+ folio_unlock(folio);
}
}
- if (page == NULL) {
- ret = -ENOMEM;
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
goto err;
}
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
ret = -EIO;
goto err;
}
- /* Pages marked accessed already */
- e4b->bd_buddy_page = page;
- e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
+ /* Folios marked accessed already */
+ e4b->bd_buddy_folio = folio;
+ e4b->bd_buddy = folio_address(folio) + (poff * sb->s_blocksize);
return 0;
err:
- if (page)
- put_page(page);
- if (e4b->bd_bitmap_page)
- put_page(e4b->bd_bitmap_page);
+ if (!IS_ERR_OR_NULL(folio))
+ folio_put(folio);
+ if (e4b->bd_bitmap_folio)
+ folio_put(e4b->bd_bitmap_folio);
e4b->bd_buddy = NULL;
e4b->bd_bitmap = NULL;
@@ -1732,10 +1730,10 @@ static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
{
- if (e4b->bd_bitmap_page)
- put_page(e4b->bd_bitmap_page);
- if (e4b->bd_buddy_page)
- put_page(e4b->bd_buddy_page);
+ if (e4b->bd_bitmap_folio)
+ folio_put(e4b->bd_bitmap_folio);
+ if (e4b->bd_buddy_folio)
+ folio_put(e4b->bd_buddy_folio);
}
@@ -1937,12 +1935,12 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
blocknr += EXT4_C2B(sbi, block);
+ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
ext4_grp_locked_error(sb, e4b->bd_group,
inode ? inode->i_ino : 0, blocknr,
"freeing already freed block (bit %u); block bitmap corrupt.",
block);
- ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
- EXT4_GROUP_INFO_BBITMAP_CORRUPT);
return;
}
@@ -1984,8 +1982,7 @@ check:
static int mb_find_extent(struct ext4_buddy *e4b, int block,
int needed, struct ext4_free_extent *ex)
{
- int next = block;
- int max, order;
+ int max, order, next;
void *buddy;
assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
@@ -2003,16 +2000,12 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block,
/* find actual order */
order = mb_find_order_for_block(e4b, block);
- block = block >> order;
- ex->fe_len = 1 << order;
- ex->fe_start = block << order;
+ ex->fe_len = (1 << order) - (block & ((1 << order) - 1));
+ ex->fe_start = block;
ex->fe_group = e4b->bd_group;
- /* calc difference from given start */
- next = next - ex->fe_start;
- ex->fe_len -= next;
- ex->fe_start += next;
+ block = block >> order;
while (needed > ex->fe_len &&
mb_find_buddy(e4b, order, &max)) {
@@ -2050,13 +2043,12 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
int ord;
int mlen = 0;
int max = 0;
- int cur;
int start = ex->fe_start;
int len = ex->fe_len;
unsigned ret = 0;
int len0 = len;
void *buddy;
- bool split = false;
+ int ord_start, ord_end;
BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
BUG_ON(e4b->bd_group != ex->fe_group);
@@ -2081,16 +2073,12 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
/* let's maintain buddy itself */
while (len) {
- if (!split)
- ord = mb_find_order_for_block(e4b, start);
+ ord = mb_find_order_for_block(e4b, start);
if (((start >> ord) << ord) == start && len >= (1 << ord)) {
/* the whole chunk may be allocated at once! */
mlen = 1 << ord;
- if (!split)
- buddy = mb_find_buddy(e4b, ord, &max);
- else
- split = false;
+ buddy = mb_find_buddy(e4b, ord, &max);
BUG_ON((start >> ord) >= max);
mb_set_bit(start >> ord, buddy);
e4b->bd_info->bb_counters[ord]--;
@@ -2104,20 +2092,29 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
if (ret == 0)
ret = len | (ord << 16);
- /* we have to split large buddy */
BUG_ON(ord <= 0);
buddy = mb_find_buddy(e4b, ord, &max);
mb_set_bit(start >> ord, buddy);
e4b->bd_info->bb_counters[ord]--;
- ord--;
- cur = (start >> ord) & ~1U;
- buddy = mb_find_buddy(e4b, ord, &max);
- mb_clear_bit(cur, buddy);
- mb_clear_bit(cur + 1, buddy);
- e4b->bd_info->bb_counters[ord]++;
- e4b->bd_info->bb_counters[ord]++;
- split = true;
+ ord_start = (start >> ord) << ord;
+ ord_end = ord_start + (1 << ord);
+ /* first chunk */
+ if (start > ord_start)
+ ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy,
+ ord_start, start - ord_start,
+ e4b->bd_info);
+
+ /* last chunk */
+ if (start + len < ord_end) {
+ ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy,
+ start + len,
+ ord_end - (start + len),
+ e4b->bd_info);
+ break;
+ }
+ len = start + len - ord_end;
+ start = ord_end;
}
mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
@@ -2159,10 +2156,10 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
* double allocate blocks. The reference is dropped
* in ext4_mb_release_context
*/
- ac->ac_bitmap_page = e4b->bd_bitmap_page;
- get_page(ac->ac_bitmap_page);
- ac->ac_buddy_page = e4b->bd_buddy_page;
- get_page(ac->ac_buddy_page);
+ ac->ac_bitmap_folio = e4b->bd_bitmap_folio;
+ folio_get(ac->ac_bitmap_folio);
+ ac->ac_buddy_folio = e4b->bd_buddy_folio;
+ folio_get(ac->ac_buddy_folio);
/* store last allocated for subsequent stream allocation */
if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
spin_lock(&sbi->s_md_lock);
@@ -2354,7 +2351,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
ex.fe_logical = 0xDEADFA11; /* debug value */
if (max >= ac->ac_g_ex.fe_len &&
- ac->ac_g_ex.fe_len == EXT4_B2C(sbi, sbi->s_stripe)) {
+ ac->ac_g_ex.fe_len == EXT4_NUM_B2C(sbi, sbi->s_stripe)) {
ext4_fsblk_t start;
start = ext4_grp_offs_to_block(ac->ac_sb, &ex);
@@ -2415,12 +2412,12 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
k = mb_find_next_zero_bit(buddy, max, 0);
if (k >= max) {
+ ext4_mark_group_bitmap_corrupted(ac->ac_sb,
+ e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0,
"%d free clusters of order %d. But found 0",
grp->bb_counters[i], i);
- ext4_mark_group_bitmap_corrupted(ac->ac_sb,
- e4b->bd_group,
- EXT4_GROUP_INFO_BBITMAP_CORRUPT);
break;
}
ac->ac_found++;
@@ -2471,12 +2468,12 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
* free blocks even though group info says we
* have free blocks
*/
+ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
"%d free clusters as per "
"group info. But bitmap says 0",
free);
- ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
- EXT4_GROUP_INFO_BBITMAP_CORRUPT);
break;
}
@@ -2502,12 +2499,12 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
if (WARN_ON(ex.fe_len <= 0))
break;
if (free < ex.fe_len) {
+ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
"%d free clusters as per "
"group info. But got %d blocks",
free, ex.fe_len);
- ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
- EXT4_GROUP_INFO_BBITMAP_CORRUPT);
/*
* The number of free blocks differs. This mostly
* indicate that the bitmap is corrupt. So exit
@@ -2551,7 +2548,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
do_div(a, sbi->s_stripe);
i = (a * sbi->s_stripe) - first_group_block;
- stripe = EXT4_B2C(sbi, sbi->s_stripe);
+ stripe = EXT4_NUM_B2C(sbi, sbi->s_stripe);
i = EXT4_B2C(sbi, i);
while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
if (!mb_test_bit(i, bitmap)) {
@@ -2685,7 +2682,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
int ret;
/*
- * cr=CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic
+ * CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic
* search to find large good chunks almost for free. If buddy
* data is not ready, then this optimization makes no sense. But
* we never skip the first block group in a flex_bg, since this
@@ -2866,6 +2863,7 @@ repeat:
group = ac->ac_g_ex.fe_group;
ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups;
prefetch_grp = group;
+ nr = 0;
for (i = 0, new_cr = cr; i < ngroups; i++,
ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) {
@@ -2924,14 +2922,21 @@ repeat:
ac->ac_groups_scanned++;
if (cr == CR_POWER2_ALIGNED)
ext4_mb_simple_scan_group(ac, &e4b);
- else if ((cr == CR_GOAL_LEN_FAST ||
- cr == CR_BEST_AVAIL_LEN) &&
- sbi->s_stripe &&
- !(ac->ac_g_ex.fe_len %
- EXT4_B2C(sbi, sbi->s_stripe)))
- ext4_mb_scan_aligned(ac, &e4b);
- else
- ext4_mb_complex_scan_group(ac, &e4b);
+ else {
+ bool is_stripe_aligned =
+ (sbi->s_stripe >=
+ sbi->s_cluster_ratio) &&
+ !(ac->ac_g_ex.fe_len %
+ EXT4_NUM_B2C(sbi, sbi->s_stripe));
+
+ if ((cr == CR_GOAL_LEN_FAST ||
+ cr == CR_BEST_AVAIL_LEN) &&
+ is_stripe_aligned)
+ ext4_mb_scan_aligned(ac, &e4b);
+
+ if (ac->ac_status == AC_STATUS_CONTINUE)
+ ext4_mb_complex_scan_group(ac, &e4b);
+ }
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
@@ -3020,8 +3025,8 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
{
struct super_block *sb = pde_data(file_inode(seq->file));
ext4_group_t group = (ext4_group_t) ((unsigned long) v);
- int i;
- int err, buddy_loaded = 0;
+ int i, err;
+ char nbuf[16];
struct ext4_buddy e4b;
struct ext4_group_info *grinfo;
unsigned char blocksize_bits = min_t(unsigned char,
@@ -3048,17 +3053,17 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err) {
- seq_printf(seq, "#%-5u: I/O error\n", group);
+ seq_printf(seq, "#%-5u: %s\n", group, ext4_decode_error(NULL, err, nbuf));
return 0;
}
- buddy_loaded = 1;
+ ext4_mb_unload_buddy(&e4b);
}
+ /*
+ * We care only about free space counters in the group info and
+ * these are safe to access even after the buddy has been unloaded
+ */
memcpy(&sg, grinfo, i);
-
- if (buddy_loaded)
- ext4_mb_unload_buddy(&e4b);
-
seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
sg.info.bb_fragments, sg.info.bb_first_free);
for (i = 0; i <= 13; i++)
@@ -3067,8 +3072,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
seq_puts(seq, " ]");
if (EXT4_MB_GRP_BBITMAP_CORRUPT(&sg.info))
seq_puts(seq, " Block bitmap corrupted!");
- seq_puts(seq, "\n");
-
+ seq_putc(seq, '\n');
return 0;
}
@@ -3191,7 +3195,6 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
}
static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos)
-__acquires(&EXT4_SB(sb)->s_mb_rb_lock)
{
struct super_block *sb = pde_data(file_inode(seq->file));
unsigned long position;
@@ -3445,10 +3448,11 @@ static int ext4_mb_init_backend(struct super_block *sb)
}
if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
sbi->s_mb_prefetch = ext4_get_groups_count(sb);
- /* now many real IOs to prefetch within a single allocation at cr=0
- * given cr=0 is an CPU-related optimization we shouldn't try to
- * load too many groups, at some point we should start to use what
- * we've got in memory.
+ /*
+ * now many real IOs to prefetch within a single allocation at
+ * CR_POWER2_ALIGNED. Given CR_POWER2_ALIGNED is an CPU-related
+ * optimization we shouldn't try to load too many groups, at some point
+ * we should start to use what we've got in memory.
* with an average random access time 5ms, it'd take a second to get
* 200 groups (* N with flex_bg), so let's make this limit 4
*/
@@ -3662,7 +3666,8 @@ int ext4_mb_init(struct super_block *sb)
spin_lock_init(&sbi->s_md_lock);
sbi->s_mb_free_pending = 0;
- INIT_LIST_HEAD(&sbi->s_freed_data_list);
+ INIT_LIST_HEAD(&sbi->s_freed_data_list[0]);
+ INIT_LIST_HEAD(&sbi->s_freed_data_list[1]);
INIT_LIST_HEAD(&sbi->s_discard_list);
INIT_WORK(&sbi->s_discard_work, ext4_discard_work);
atomic_set(&sbi->s_retry_alloc_pending, 0);
@@ -3698,7 +3703,7 @@ int ext4_mb_init(struct super_block *sb)
*/
if (sbi->s_stripe > 1) {
sbi->s_mb_group_prealloc = roundup(
- sbi->s_mb_group_prealloc, EXT4_B2C(sbi, sbi->s_stripe));
+ sbi->s_mb_group_prealloc, EXT4_NUM_B2C(sbi, sbi->s_stripe));
}
sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
@@ -3757,7 +3762,7 @@ static int ext4_mb_cleanup_pa(struct ext4_group_info *grp)
return count;
}
-int ext4_mb_release(struct super_block *sb)
+void ext4_mb_release(struct super_block *sb)
{
ext4_group_t ngroups = ext4_get_groups_count(sb);
ext4_group_t i;
@@ -3833,13 +3838,10 @@ int ext4_mb_release(struct super_block *sb)
}
free_percpu(sbi->s_locality_groups);
-
- return 0;
}
static inline int ext4_issue_discard(struct super_block *sb,
- ext4_group_t block_group, ext4_grpblk_t cluster, int count,
- struct bio **biop)
+ ext4_group_t block_group, ext4_grpblk_t cluster, int count)
{
ext4_fsblk_t discard_block;
@@ -3848,13 +3850,8 @@ static inline int ext4_issue_discard(struct super_block *sb,
count = EXT4_C2B(EXT4_SB(sb), count);
trace_ext4_discard_blocks(sb,
(unsigned long long) discard_block, count);
- if (biop) {
- return __blkdev_issue_discard(sb->s_bdev,
- (sector_t)discard_block << (sb->s_blocksize_bits - 9),
- (sector_t)count << (sb->s_blocksize_bits - 9),
- GFP_NOFS, biop);
- } else
- return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
+
+ return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
}
static void ext4_free_data_in_buddy(struct super_block *sb,
@@ -3893,8 +3890,8 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
/* No more items in the per group rb tree
* balance refcounts from ext4_mb_free_metadata()
*/
- put_page(e4b.bd_buddy_page);
- put_page(e4b.bd_bitmap_page);
+ folio_put(e4b.bd_buddy_folio);
+ folio_put(e4b.bd_bitmap_folio);
}
ext4_unlock_group(sb, entry->efd_group);
ext4_mb_unload_buddy(&e4b);
@@ -3911,19 +3908,10 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_free_data *entry, *tmp;
LIST_HEAD(freed_data_list);
- struct list_head *cut_pos = NULL;
+ struct list_head *s_freed_head = &sbi->s_freed_data_list[commit_tid & 1];
bool wake;
- spin_lock(&sbi->s_md_lock);
- list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) {
- if (entry->efd_tid != commit_tid)
- break;
- cut_pos = &entry->efd_list;
- }
- if (cut_pos)
- list_cut_position(&freed_data_list, &sbi->s_freed_data_list,
- cut_pos);
- spin_unlock(&sbi->s_md_lock);
+ list_replace_init(s_freed_head, &freed_data_list);
list_for_each_entry(entry, &freed_data_list, efd_list)
ext4_free_data_in_buddy(sb, entry);
@@ -3981,6 +3969,111 @@ void ext4_exit_mballoc(void)
ext4_groupinfo_destroy_slabs();
}
+#define EXT4_MB_BITMAP_MARKED_CHECK 0x0001
+#define EXT4_MB_SYNC_UPDATE 0x0002
+static int
+ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state,
+ ext4_group_t group, ext4_grpblk_t blkoff,
+ ext4_grpblk_t len, int flags, ext4_grpblk_t *ret_changed)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct buffer_head *bitmap_bh = NULL;
+ struct ext4_group_desc *gdp;
+ struct buffer_head *gdp_bh;
+ int err;
+ unsigned int i, already, changed = len;
+
+ KUNIT_STATIC_STUB_REDIRECT(ext4_mb_mark_context,
+ handle, sb, state, group, blkoff, len,
+ flags, ret_changed);
+
+ if (ret_changed)
+ *ret_changed = 0;
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(bitmap_bh))
+ return PTR_ERR(bitmap_bh);
+
+ if (handle) {
+ BUFFER_TRACE(bitmap_bh, "getting write access");
+ err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto out_err;
+ }
+
+ err = -EIO;
+ gdp = ext4_get_group_desc(sb, group, &gdp_bh);
+ if (!gdp)
+ goto out_err;
+
+ if (handle) {
+ BUFFER_TRACE(gdp_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sb, gdp_bh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto out_err;
+ }
+
+ ext4_lock_group(sb, group);
+ if (ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_clusters_after_init(sb, group, gdp));
+ }
+
+ if (flags & EXT4_MB_BITMAP_MARKED_CHECK) {
+ already = 0;
+ for (i = 0; i < len; i++)
+ if (mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
+ state)
+ already++;
+ changed = len - already;
+ }
+
+ if (state) {
+ mb_set_bits(bitmap_bh->b_data, blkoff, len);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_group_clusters(sb, gdp) - changed);
+ } else {
+ mb_clear_bits(bitmap_bh->b_data, blkoff, len);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_group_clusters(sb, gdp) + changed);
+ }
+
+ ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
+ ext4_unlock_group(sb, group);
+ if (ret_changed)
+ *ret_changed = changed;
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, group);
+ struct flex_groups *fg = sbi_array_rcu_deref(sbi,
+ s_flex_groups, flex_group);
+
+ if (state)
+ atomic64_sub(changed, &fg->free_clusters);
+ else
+ atomic64_add(changed, &fg->free_clusters);
+ }
+
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+ if (err)
+ goto out_err;
+ err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
+ if (err)
+ goto out_err;
+
+ if (flags & EXT4_MB_SYNC_UPDATE) {
+ sync_dirty_buffer(bitmap_bh);
+ sync_dirty_buffer(gdp_bh);
+ }
+
+out_err:
+ brelse(bitmap_bh);
+ return err;
+}
/*
* Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
@@ -3990,13 +4083,13 @@ static noinline_for_stack int
ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
handle_t *handle, unsigned int reserv_clstrs)
{
- struct buffer_head *bitmap_bh = NULL;
struct ext4_group_desc *gdp;
- struct buffer_head *gdp_bh;
struct ext4_sb_info *sbi;
struct super_block *sb;
ext4_fsblk_t block;
int err, len;
+ int flags = 0;
+ ext4_grpblk_t changed;
BUG_ON(ac->ac_status != AC_STATUS_FOUND);
BUG_ON(ac->ac_b_ex.fe_len <= 0);
@@ -4004,32 +4097,13 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
sb = ac->ac_sb;
sbi = EXT4_SB(sb);
- bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
- if (IS_ERR(bitmap_bh)) {
- return PTR_ERR(bitmap_bh);
- }
-
- BUFFER_TRACE(bitmap_bh, "getting write access");
- err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
- EXT4_JTR_NONE);
- if (err)
- goto out_err;
-
- err = -EIO;
- gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
+ gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, NULL);
if (!gdp)
- goto out_err;
-
+ return -EIO;
ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
ext4_free_group_clusters(sb, gdp));
- BUFFER_TRACE(gdp_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE);
- if (err)
- goto out_err;
-
block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
-
len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
if (!ext4_inode_block_valid(ac->ac_inode, block, len)) {
ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
@@ -4038,41 +4112,29 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
* Fix the bitmap and return EFSCORRUPTED
* We leak some of the blocks here.
*/
- ext4_lock_group(sb, ac->ac_b_ex.fe_group);
- mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
- ac->ac_b_ex.fe_len);
- ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
- err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+ err = ext4_mb_mark_context(handle, sb, true,
+ ac->ac_b_ex.fe_group,
+ ac->ac_b_ex.fe_start,
+ ac->ac_b_ex.fe_len,
+ 0, NULL);
if (!err)
err = -EFSCORRUPTED;
- goto out_err;
+ return err;
}
- ext4_lock_group(sb, ac->ac_b_ex.fe_group);
#ifdef AGGRESSIVE_CHECK
- {
- int i;
- for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
- BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
- bitmap_bh->b_data));
- }
- }
+ flags |= EXT4_MB_BITMAP_MARKED_CHECK;
#endif
- mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
- ac->ac_b_ex.fe_len);
- if (ext4_has_group_desc_csum(sb) &&
- (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
- ext4_free_group_clusters_set(sb, gdp,
- ext4_free_clusters_after_init(sb,
- ac->ac_b_ex.fe_group, gdp));
- }
- len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
- ext4_free_group_clusters_set(sb, gdp, len);
- ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
- ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
+ err = ext4_mb_mark_context(handle, sb, true, ac->ac_b_ex.fe_group,
+ ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len,
+ flags, &changed);
+
+ if (err && changed == 0)
+ return err;
- ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
+#ifdef AGGRESSIVE_CHECK
+ BUG_ON(changed != ac->ac_b_ex.fe_len);
+#endif
percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
/*
* Now reduce the dirty block count also. Should not go negative
@@ -4082,21 +4144,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
percpu_counter_sub(&sbi->s_dirtyclusters_counter,
reserv_clstrs);
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group = ext4_flex_group(sbi,
- ac->ac_b_ex.fe_group);
- atomic64_sub(ac->ac_b_ex.fe_len,
- &sbi_array_rcu_deref(sbi, s_flex_groups,
- flex_group)->free_clusters);
- }
-
- err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
- if (err)
- goto out_err;
- err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
-
-out_err:
- brelse(bitmap_bh);
return err;
}
@@ -4105,17 +4152,13 @@ out_err:
* blocks in bitmaps and update counters.
*/
void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
- int len, int state)
+ int len, bool state)
{
- struct buffer_head *bitmap_bh = NULL;
- struct ext4_group_desc *gdp;
- struct buffer_head *gdp_bh;
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_group_t group;
ext4_grpblk_t blkoff;
- int i, err = 0;
- int already;
- unsigned int clen, clen_changed, thisgrp_len;
+ int err = 0;
+ unsigned int clen, thisgrp_len;
while (len > 0) {
ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
@@ -4136,80 +4179,21 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
ext4_error(sb, "Marking blocks in system zone - "
"Block = %llu, len = %u",
block, thisgrp_len);
- bitmap_bh = NULL;
break;
}
- bitmap_bh = ext4_read_block_bitmap(sb, group);
- if (IS_ERR(bitmap_bh)) {
- err = PTR_ERR(bitmap_bh);
- bitmap_bh = NULL;
- break;
- }
-
- err = -EIO;
- gdp = ext4_get_group_desc(sb, group, &gdp_bh);
- if (!gdp)
- break;
-
- ext4_lock_group(sb, group);
- already = 0;
- for (i = 0; i < clen; i++)
- if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
- !state)
- already++;
-
- clen_changed = clen - already;
- if (state)
- mb_set_bits(bitmap_bh->b_data, blkoff, clen);
- else
- mb_clear_bits(bitmap_bh->b_data, blkoff, clen);
- if (ext4_has_group_desc_csum(sb) &&
- (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
- ext4_free_group_clusters_set(sb, gdp,
- ext4_free_clusters_after_init(sb, group, gdp));
- }
- if (state)
- clen = ext4_free_group_clusters(sb, gdp) - clen_changed;
- else
- clen = ext4_free_group_clusters(sb, gdp) + clen_changed;
-
- ext4_free_group_clusters_set(sb, gdp, clen);
- ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
- ext4_group_desc_csum_set(sb, group, gdp);
-
- ext4_unlock_group(sb, group);
-
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group = ext4_flex_group(sbi, group);
- struct flex_groups *fg = sbi_array_rcu_deref(sbi,
- s_flex_groups, flex_group);
-
- if (state)
- atomic64_sub(clen_changed, &fg->free_clusters);
- else
- atomic64_add(clen_changed, &fg->free_clusters);
-
- }
-
- err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
- if (err)
- break;
- sync_dirty_buffer(bitmap_bh);
- err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
- sync_dirty_buffer(gdp_bh);
+ err = ext4_mb_mark_context(NULL, sb, state,
+ group, blkoff, clen,
+ EXT4_MB_BITMAP_MARKED_CHECK |
+ EXT4_MB_SYNC_UPDATE,
+ NULL);
if (err)
break;
block += thisgrp_len;
len -= thisgrp_len;
- brelse(bitmap_bh);
BUG_ON(len < 0);
}
-
- if (err)
- brelse(bitmap_bh);
}
/*
@@ -5331,7 +5315,7 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
* the caller MUST hold group/inode locks.
* TODO: optimize the case when there are no in-core structures yet
*/
-static noinline_for_stack int
+static noinline_for_stack void
ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
struct ext4_prealloc_space *pa)
{
@@ -5381,11 +5365,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
*/
}
atomic_add(free, &sbi->s_mb_discarded);
-
- return 0;
}
-static noinline_for_stack int
+static noinline_for_stack void
ext4_mb_release_group_pa(struct ext4_buddy *e4b,
struct ext4_prealloc_space *pa)
{
@@ -5399,13 +5381,11 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) {
ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu",
e4b->bd_group, group, pa->pa_pstart);
- return 0;
+ return;
}
mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
-
- return 0;
}
/*
@@ -5526,7 +5506,7 @@ out_dbg:
*
* FIXME!! Make sure it is valid at all the call sites
*/
-void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
+void ext4_discard_preallocations(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
struct super_block *sb = inode->i_sb;
@@ -5538,9 +5518,8 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
struct rb_node *iter;
int err;
- if (!S_ISREG(inode->i_mode)) {
+ if (!S_ISREG(inode->i_mode))
return;
- }
if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
return;
@@ -5548,15 +5527,12 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
mb_debug(sb, "discard preallocation for inode %lu\n",
inode->i_ino);
trace_ext4_discard_preallocations(inode,
- atomic_read(&ei->i_prealloc_active), needed);
-
- if (needed == 0)
- needed = UINT_MAX;
+ atomic_read(&ei->i_prealloc_active));
repeat:
/* first, collect all pa's in the inode */
write_lock(&ei->i_prealloc_lock);
- for (iter = rb_first(&ei->i_prealloc_node); iter && needed;
+ for (iter = rb_first(&ei->i_prealloc_node); iter;
iter = rb_next(iter)) {
pa = rb_entry(iter, struct ext4_prealloc_space,
pa_node.inode_node);
@@ -5580,7 +5556,6 @@ repeat:
spin_unlock(&pa->pa_lock);
rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
list_add(&pa->u.pa_tmp_list, &list);
- needed--;
continue;
}
@@ -5990,7 +5965,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
/*
* release all resource we used in allocation
*/
-static int ext4_mb_release_context(struct ext4_allocation_context *ac)
+static void ext4_mb_release_context(struct ext4_allocation_context *ac)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_prealloc_space *pa = ac->ac_pa;
@@ -6020,14 +5995,13 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
ext4_mb_put_pa(ac, ac->ac_sb, pa);
}
- if (ac->ac_bitmap_page)
- put_page(ac->ac_bitmap_page);
- if (ac->ac_buddy_page)
- put_page(ac->ac_buddy_page);
+ if (ac->ac_bitmap_folio)
+ folio_put(ac->ac_bitmap_folio);
+ if (ac->ac_buddy_folio)
+ folio_put(ac->ac_buddy_folio);
if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
mutex_unlock(&ac->ac_lg->lg_mutex);
ext4_mb_collect_stats(ac);
- return 0;
}
static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
@@ -6142,7 +6116,7 @@ ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp)
}
block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i);
- ext4_mb_mark_bb(sb, block, 1, 1);
+ ext4_mb_mark_bb(sb, block, 1, true);
ar->len = 1;
*errp = 0;
@@ -6340,8 +6314,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
struct rb_node *parent = NULL, *new_node;
BUG_ON(!ext4_handle_valid(handle));
- BUG_ON(e4b->bd_bitmap_page == NULL);
- BUG_ON(e4b->bd_buddy_page == NULL);
+ BUG_ON(e4b->bd_bitmap_folio == NULL);
+ BUG_ON(e4b->bd_buddy_folio == NULL);
new_node = &new_entry->efd_node;
cluster = new_entry->efd_start_cluster;
@@ -6352,8 +6326,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
* otherwise we'll refresh it from
* on-disk bitmap and lose not-yet-available
* blocks */
- get_page(e4b->bd_buddy_page);
- get_page(e4b->bd_bitmap_page);
+ folio_get(e4b->bd_buddy_folio);
+ folio_get(e4b->bd_bitmap_folio);
}
while (*n) {
parent = *n;
@@ -6391,7 +6365,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
}
spin_lock(&sbi->s_md_lock);
- list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list);
+ list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list[new_entry->efd_tid & 1]);
sbi->s_mb_free_pending += clusters;
spin_unlock(&sbi->s_md_lock);
}
@@ -6399,43 +6373,15 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
unsigned long count)
{
- struct buffer_head *bitmap_bh;
struct super_block *sb = inode->i_sb;
- struct ext4_group_desc *gdp;
- struct buffer_head *gdp_bh;
ext4_group_t group;
ext4_grpblk_t blkoff;
- int already_freed = 0, err, i;
ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
- bitmap_bh = ext4_read_block_bitmap(sb, group);
- if (IS_ERR(bitmap_bh)) {
- pr_warn("Failed to read block bitmap\n");
- return;
- }
- gdp = ext4_get_group_desc(sb, group, &gdp_bh);
- if (!gdp)
- goto err_out;
-
- for (i = 0; i < count; i++) {
- if (!mb_test_bit(blkoff + i, bitmap_bh->b_data))
- already_freed++;
- }
- mb_clear_bits(bitmap_bh->b_data, blkoff, count);
- err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
- if (err)
- goto err_out;
- ext4_free_group_clusters_set(
- sb, gdp, ext4_free_group_clusters(sb, gdp) +
- count - already_freed);
- ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
- ext4_group_desc_csum_set(sb, group, gdp);
- ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
- sync_dirty_buffer(bitmap_bh);
- sync_dirty_buffer(gdp_bh);
-
-err_out:
- brelse(bitmap_bh);
+ ext4_mb_mark_context(NULL, sb, false, group, blkoff, count,
+ EXT4_MB_BITMAP_MARKED_CHECK |
+ EXT4_MB_SYNC_UPDATE,
+ NULL);
}
/**
@@ -6451,19 +6397,17 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
ext4_fsblk_t block, unsigned long count,
int flags)
{
- struct buffer_head *bitmap_bh = NULL;
struct super_block *sb = inode->i_sb;
- struct ext4_group_desc *gdp;
struct ext4_group_info *grp;
unsigned int overflow;
ext4_grpblk_t bit;
- struct buffer_head *gd_bh;
ext4_group_t block_group;
struct ext4_sb_info *sbi;
struct ext4_buddy e4b;
unsigned int count_clusters;
int err = 0;
- int ret;
+ int mark_flags = 0;
+ ext4_grpblk_t changed;
sbi = EXT4_SB(sb);
@@ -6472,7 +6416,7 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
ext4_error(sb, "Freeing blocks in system zone - "
"Block = %llu, count = %lu", block, count);
/* err = 0. ext4_std_error should be a no op */
- goto error_return;
+ goto error_out;
}
flags |= EXT4_FREE_BLOCKS_VALIDATED;
@@ -6496,55 +6440,35 @@ do_more:
flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
}
count_clusters = EXT4_NUM_B2C(sbi, count);
- bitmap_bh = ext4_read_block_bitmap(sb, block_group);
- if (IS_ERR(bitmap_bh)) {
- err = PTR_ERR(bitmap_bh);
- bitmap_bh = NULL;
- goto error_return;
- }
- gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
- if (!gdp) {
- err = -EIO;
- goto error_return;
- }
+ trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
+
+ /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
+ err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
+ GFP_NOFS|__GFP_NOFAIL);
+ if (err)
+ goto error_out;
if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
!ext4_inode_block_valid(inode, block, count)) {
ext4_error(sb, "Freeing blocks in system zone - "
"Block = %llu, count = %lu", block, count);
/* err = 0. ext4_std_error should be a no op */
- goto error_return;
+ goto error_clean;
}
- BUFFER_TRACE(bitmap_bh, "getting write access");
- err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
- EXT4_JTR_NONE);
- if (err)
- goto error_return;
-
- /*
- * We are about to modify some metadata. Call the journal APIs
- * to unshare ->b_data if a currently-committing transaction is
- * using it
- */
- BUFFER_TRACE(gd_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE);
- if (err)
- goto error_return;
#ifdef AGGRESSIVE_CHECK
- {
- int i;
- for (i = 0; i < count_clusters; i++)
- BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
- }
+ mark_flags |= EXT4_MB_BITMAP_MARKED_CHECK;
#endif
- trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
+ err = ext4_mb_mark_context(handle, sb, false, block_group, bit,
+ count_clusters, mark_flags, &changed);
- /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
- err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
- GFP_NOFS|__GFP_NOFAIL);
- if (err)
- goto error_return;
+
+ if (err && changed == 0)
+ goto error_clean;
+
+#ifdef AGGRESSIVE_CHECK
+ BUG_ON(changed != count_clusters);
+#endif
/*
* We need to make sure we don't reuse the freed block until after the
@@ -6568,17 +6492,18 @@ do_more:
new_entry->efd_tid = handle->h_transaction->t_tid;
ext4_lock_group(sb, block_group);
- mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
ext4_mb_free_metadata(handle, &e4b, new_entry);
} else {
- /* need to update group_info->bb_free and bitmap
- * with group lock held. generate_buddy look at
- * them with group lock_held
- */
if (test_opt(sb, DISCARD)) {
err = ext4_issue_discard(sb, block_group, bit,
- count_clusters, NULL);
- if (err && err != -EOPNOTSUPP)
+ count_clusters);
+ /*
+ * Ignore EOPNOTSUPP error. This is consistent with
+ * what happens when using journal.
+ */
+ if (err == -EOPNOTSUPP)
+ err = 0;
+ if (err)
ext4_msg(sb, KERN_WARNING, "discard request in"
" group:%u block:%d count:%lu failed"
" with %d", block_group, bit, count,
@@ -6588,23 +6513,11 @@ do_more:
EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
ext4_lock_group(sb, block_group);
- mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
mb_free_blocks(inode, &e4b, bit, count_clusters);
}
- ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
- ext4_free_group_clusters_set(sb, gdp, ret);
- ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
- ext4_group_desc_csum_set(sb, block_group, gdp);
ext4_unlock_group(sb, block_group);
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- atomic64_add(count_clusters,
- &sbi_array_rcu_deref(sbi, s_flex_groups,
- flex_group)->free_clusters);
- }
-
/*
* on a bigalloc file system, defer the s_freeclusters_counter
* update to the caller (ext4_remove_space and friends) so they
@@ -6617,28 +6530,18 @@ do_more:
count_clusters);
}
- ext4_mb_unload_buddy(&e4b);
-
- /* We dirtied the bitmap block */
- BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
- err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
-
- /* And the group descriptor block */
- BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
- ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
- if (!err)
- err = ret;
-
if (overflow && !err) {
block += count;
count = overflow;
- put_bh(bitmap_bh);
+ ext4_mb_unload_buddy(&e4b);
/* The range changed so it's no longer validated */
flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
goto do_more;
}
-error_return:
- brelse(bitmap_bh);
+
+error_clean:
+ ext4_mb_unload_buddy(&e4b);
+error_out:
ext4_std_error(sb, err);
}
@@ -6736,7 +6639,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
for (i = 0; i < count; i++) {
cond_resched();
if (is_metadata)
- bh = sb_find_get_block(inode->i_sb, block + i);
+ bh = sb_find_get_block_nonatomic(inode->i_sb,
+ block + i);
ext4_forget(handle, is_metadata, inode, bh, block + i);
}
}
@@ -6756,23 +6660,19 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
ext4_fsblk_t block, unsigned long count)
{
- struct buffer_head *bitmap_bh = NULL;
- struct buffer_head *gd_bh;
ext4_group_t block_group;
ext4_grpblk_t bit;
- unsigned int i;
- struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_buddy e4b;
- int err = 0, ret, free_clusters_count;
- ext4_grpblk_t clusters_freed;
+ int err = 0;
ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block);
ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1);
unsigned long cluster_count = last_cluster - first_cluster + 1;
+ ext4_grpblk_t changed;
ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
- if (count == 0)
+ if (cluster_count == 0)
return 0;
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -6784,99 +6684,39 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
ext4_warning(sb, "too many blocks added to group %u",
block_group);
err = -EINVAL;
- goto error_return;
- }
-
- bitmap_bh = ext4_read_block_bitmap(sb, block_group);
- if (IS_ERR(bitmap_bh)) {
- err = PTR_ERR(bitmap_bh);
- bitmap_bh = NULL;
- goto error_return;
+ goto error_out;
}
- desc = ext4_get_group_desc(sb, block_group, &gd_bh);
- if (!desc) {
- err = -EIO;
- goto error_return;
- }
+ err = ext4_mb_load_buddy(sb, block_group, &e4b);
+ if (err)
+ goto error_out;
if (!ext4_sb_block_valid(sb, NULL, block, count)) {
ext4_error(sb, "Adding blocks in system zones - "
"Block = %llu, count = %lu",
block, count);
err = -EINVAL;
- goto error_return;
+ goto error_clean;
}
- BUFFER_TRACE(bitmap_bh, "getting write access");
- err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
- EXT4_JTR_NONE);
- if (err)
- goto error_return;
+ err = ext4_mb_mark_context(handle, sb, false, block_group, bit,
+ cluster_count, EXT4_MB_BITMAP_MARKED_CHECK,
+ &changed);
+ if (err && changed == 0)
+ goto error_clean;
- /*
- * We are about to modify some metadata. Call the journal APIs
- * to unshare ->b_data if a currently-committing transaction is
- * using it
- */
- BUFFER_TRACE(gd_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE);
- if (err)
- goto error_return;
-
- for (i = 0, clusters_freed = 0; i < cluster_count; i++) {
- BUFFER_TRACE(bitmap_bh, "clear bit");
- if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
- ext4_error(sb, "bit already cleared for block %llu",
- (ext4_fsblk_t)(block + i));
- BUFFER_TRACE(bitmap_bh, "bit already cleared");
- } else {
- clusters_freed++;
- }
- }
+ if (changed != cluster_count)
+ ext4_error(sb, "bit already cleared in group %u", block_group);
- err = ext4_mb_load_buddy(sb, block_group, &e4b);
- if (err)
- goto error_return;
-
- /*
- * need to update group_info->bb_free and bitmap
- * with group lock held. generate_buddy look at
- * them with group lock_held
- */
ext4_lock_group(sb, block_group);
- mb_clear_bits(bitmap_bh->b_data, bit, cluster_count);
mb_free_blocks(NULL, &e4b, bit, cluster_count);
- free_clusters_count = clusters_freed +
- ext4_free_group_clusters(sb, desc);
- ext4_free_group_clusters_set(sb, desc, free_clusters_count);
- ext4_block_bitmap_csum_set(sb, desc, bitmap_bh);
- ext4_group_desc_csum_set(sb, block_group, desc);
ext4_unlock_group(sb, block_group);
percpu_counter_add(&sbi->s_freeclusters_counter,
- clusters_freed);
-
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- atomic64_add(clusters_freed,
- &sbi_array_rcu_deref(sbi, s_flex_groups,
- flex_group)->free_clusters);
- }
+ changed);
+error_clean:
ext4_mb_unload_buddy(&e4b);
-
- /* We dirtied the bitmap block */
- BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
- err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
-
- /* And the group descriptor block */
- BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
- ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
- if (!err)
- err = ret;
-
-error_return:
- brelse(bitmap_bh);
+error_out:
ext4_std_error(sb, err);
return err;
}
@@ -6915,7 +6755,7 @@ __acquires(bitlock)
*/
mb_mark_used(e4b, &ex);
ext4_unlock_group(sb, group);
- ret = ext4_issue_discard(sb, group, start, count, NULL);
+ ret = ext4_issue_discard(sb, group, start, count);
ext4_lock_group(sb, group);
mb_free_blocks(NULL, e4b, start, ex.fe_len);
return ret;
@@ -7207,3 +7047,7 @@ out_unload:
return error;
}
+
+#ifdef CONFIG_EXT4_KUNIT_TESTS
+#include "mballoc-test.c"
+#endif
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index dd16050022f5..f8280de3e882 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -187,12 +187,11 @@ struct ext4_allocation_context {
struct ext4_free_extent ac_f_ex;
/*
- * goal len can change in CR1.5, so save the original len. This is
- * used while adjusting the PA window and for accounting.
+ * goal len can change in CR_BEST_AVAIL_LEN, so save the original len.
+ * This is used while adjusting the PA window and for accounting.
*/
ext4_grpblk_t ac_orig_goal_len;
- __u32 ac_groups_considered;
__u32 ac_flags; /* allocation hints */
__u32 ac_groups_linear_remaining;
__u16 ac_groups_scanned;
@@ -205,8 +204,8 @@ struct ext4_allocation_context {
__u8 ac_2order; /* if request is to allocate 2^N blocks and
* N > 0, the field stores N, otherwise 0 */
__u8 ac_op; /* operation, for history only */
- struct page *ac_bitmap_page;
- struct page *ac_buddy_page;
+ struct folio *ac_bitmap_folio;
+ struct folio *ac_buddy_folio;
struct ext4_prealloc_space *ac_pa;
struct ext4_locality_group *ac_lg;
};
@@ -216,9 +215,9 @@ struct ext4_allocation_context {
#define AC_STATUS_BREAK 3
struct ext4_buddy {
- struct page *bd_buddy_page;
+ struct folio *bd_buddy_folio;
void *bd_buddy;
- struct page *bd_bitmap_page;
+ struct folio *bd_bitmap_folio;
void *bd_bitmap;
struct ext4_group_info *bd_info;
struct super_block *bd_sb;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index a5e1492bbaaa..1b0dfd963d3f 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -37,7 +37,6 @@ static int finish_range(handle_t *handle, struct inode *inode,
path = ext4_find_extent(inode, lb->first_block, NULL, 0);
if (IS_ERR(path)) {
retval = PTR_ERR(path);
- path = NULL;
goto err_out;
}
@@ -53,7 +52,9 @@ static int finish_range(handle_t *handle, struct inode *inode,
retval = ext4_datasem_ensure_credits(handle, inode, needed, needed, 0);
if (retval < 0)
goto err_out;
- retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0);
+ path = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
+ if (IS_ERR(path))
+ retval = PTR_ERR(path);
err_out:
up_write((&EXT4_I(inode)->i_data_sem));
ext4_free_ext_path(path);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 5e6b07b34960..898443e98efc 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -17,26 +17,23 @@
* get_ext_path() - Find an extent path for designated logical block number.
* @inode: inode to be searched
* @lblock: logical block number to find an extent path
- * @ppath: pointer to an extent path pointer (for output)
+ * @path: pointer to an extent path
*
- * ext4_find_extent wrapper. Return 0 on success, or a negative error value
- * on failure.
+ * ext4_find_extent wrapper. Return an extent path pointer on success,
+ * or an error pointer on failure.
*/
-static inline int
+static inline struct ext4_ext_path *
get_ext_path(struct inode *inode, ext4_lblk_t lblock,
- struct ext4_ext_path **ppath)
+ struct ext4_ext_path *path)
{
- struct ext4_ext_path *path;
-
- path = ext4_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE);
+ path = ext4_find_extent(inode, lblock, path, EXT4_EX_NOCACHE);
if (IS_ERR(path))
- return PTR_ERR(path);
+ return path;
if (path[ext_depth(inode)].p_ext == NULL) {
ext4_free_ext_path(path);
- *ppath = NULL;
- return -ENODATA;
+ return ERR_PTR(-ENODATA);
}
- return 0;
+ return path;
}
/**
@@ -94,9 +91,11 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
int ret = 0;
ext4_lblk_t last = from + count;
while (from < last) {
- *err = get_ext_path(inode, from, &path);
- if (*err)
- goto out;
+ path = get_ext_path(inode, from, path);
+ if (IS_ERR(path)) {
+ *err = PTR_ERR(path);
+ return ret;
+ }
ext = path[ext_depth(inode)].p_ext;
if (unwritten != ext4_ext_is_unwritten(ext))
goto out;
@@ -183,14 +182,14 @@ static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to)
blocksize = i_blocksize(inode);
head = folio_buffers(folio);
- if (!head) {
- create_empty_buffers(&folio->page, blocksize, 0);
- head = folio_buffers(folio);
- }
+ if (!head)
+ head = create_empty_buffers(folio, blocksize, 0);
- block = (sector_t)folio->index << (PAGE_SHIFT - inode->i_blkbits);
- for (bh = head, block_start = 0; bh != head || !block_start;
- block++, block_start = block_end, bh = bh->b_this_page) {
+ block = folio_pos(folio) >> inode->i_blkbits;
+ block_end = 0;
+ bh = head;
+ do {
+ block_start = block_end;
block_end = block_start + blocksize;
if (block_end <= from || block_start >= to) {
if (!buffer_uptodate(bh))
@@ -216,7 +215,8 @@ static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to)
}
ext4_read_bh_nowait(bh, 0, NULL, false);
nr++;
- }
+ } while (block++, (bh = bh->b_this_page) != head);
+
/* No io required */
if (!nr)
goto out;
@@ -386,9 +386,10 @@ data_copy:
}
/* Perform all necessary steps similar write_begin()/write_end()
* but keeping in mind that i_size will not change */
- if (!folio_buffers(folio[0]))
- create_empty_buffers(&folio[0]->page, 1 << orig_inode->i_blkbits, 0);
bh = folio_buffers(folio[0]);
+ if (!bh)
+ bh = create_empty_buffers(folio[0],
+ 1 << orig_inode->i_blkbits, 0);
for (i = 0; i < data_offset_in_page; i++)
bh = bh->b_this_page;
for (i = 0; i < block_len_in_page; i++) {
@@ -633,9 +634,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
int offset_in_page;
int unwritten, cur_len;
- ret = get_ext_path(orig_inode, o_start, &path);
- if (ret)
+ path = get_ext_path(orig_inode, o_start, path);
+ if (IS_ERR(path)) {
+ ret = PTR_ERR(path);
goto out;
+ }
ex = path[path->p_depth].p_ext;
cur_blk = le32_to_cpu(ex->ee_block);
cur_len = ext4_ext_get_actual_len(ex);
@@ -693,8 +696,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
out:
if (*moved_len) {
- ext4_discard_preallocations(orig_inode, 0);
- ext4_discard_preallocations(donor_inode, 0);
+ ext4_discard_preallocations(orig_inode);
+ ext4_discard_preallocations(donor_inode);
}
ext4_free_ext_path(path);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 4de1f61bba76..286f8fcb74cc 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1391,62 +1391,11 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
}
#if IS_ENABLED(CONFIG_UNICODE)
-/*
- * Test whether a case-insensitive directory entry matches the filename
- * being searched for. If quick is set, assume the name being looked up
- * is already in the casefolded form.
- *
- * Returns: 0 if the directory entry matches, more than 0 if it
- * doesn't match or less than zero on error.
- */
-static int ext4_ci_compare(const struct inode *parent, const struct qstr *name,
- u8 *de_name, size_t de_name_len, bool quick)
-{
- const struct super_block *sb = parent->i_sb;
- const struct unicode_map *um = sb->s_encoding;
- struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len);
- struct qstr entry = QSTR_INIT(de_name, de_name_len);
- int ret;
-
- if (IS_ENCRYPTED(parent)) {
- const struct fscrypt_str encrypted_name =
- FSTR_INIT(de_name, de_name_len);
-
- decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL);
- if (!decrypted_name.name)
- return -ENOMEM;
- ret = fscrypt_fname_disk_to_usr(parent, 0, 0, &encrypted_name,
- &decrypted_name);
- if (ret < 0)
- goto out;
- entry.name = decrypted_name.name;
- entry.len = decrypted_name.len;
- }
-
- if (quick)
- ret = utf8_strncasecmp_folded(um, name, &entry);
- else
- ret = utf8_strncasecmp(um, name, &entry);
- if (ret < 0) {
- /* Handle invalid character sequence as either an error
- * or as an opaque byte sequence.
- */
- if (sb_has_strict_encoding(sb))
- ret = -EINVAL;
- else if (name->len != entry.len)
- ret = 1;
- else
- ret = !!memcmp(name->name, entry.name, entry.len);
- }
-out:
- kfree(decrypted_name.name);
- return ret;
-}
-
int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
struct ext4_filename *name)
{
- struct fscrypt_str *cf_name = &name->cf_name;
+ struct qstr *cf_name = &name->cf_name;
+ unsigned char *buf;
struct dx_hash_info *hinfo = &name->hinfo;
int len;
@@ -1456,18 +1405,18 @@ int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
return 0;
}
- cf_name->name = kmalloc(EXT4_NAME_LEN, GFP_NOFS);
- if (!cf_name->name)
+ buf = kmalloc(EXT4_NAME_LEN, GFP_NOFS);
+ if (!buf)
return -ENOMEM;
- len = utf8_casefold(dir->i_sb->s_encoding,
- iname, cf_name->name,
- EXT4_NAME_LEN);
+ len = utf8_casefold(dir->i_sb->s_encoding, iname, buf, EXT4_NAME_LEN);
if (len <= 0) {
- kfree(cf_name->name);
- cf_name->name = NULL;
+ kfree(buf);
+ buf = NULL;
}
+ cf_name->name = buf;
cf_name->len = (unsigned) len;
+
if (!IS_ENCRYPTED(dir))
return 0;
@@ -1503,22 +1452,29 @@ static bool ext4_match(struct inode *parent,
#if IS_ENABLED(CONFIG_UNICODE)
if (IS_CASEFOLDED(parent) &&
(!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) {
- if (fname->cf_name.name) {
- struct qstr cf = {.name = fname->cf_name.name,
- .len = fname->cf_name.len};
- if (IS_ENCRYPTED(parent)) {
- if (fname->hinfo.hash != EXT4_DIRENT_HASH(de) ||
- fname->hinfo.minor_hash !=
- EXT4_DIRENT_MINOR_HASH(de)) {
-
- return false;
- }
- }
- return !ext4_ci_compare(parent, &cf, de->name,
- de->name_len, true);
- }
- return !ext4_ci_compare(parent, fname->usr_fname, de->name,
- de->name_len, false);
+ /*
+ * Just checking IS_ENCRYPTED(parent) below is not
+ * sufficient to decide whether one can use the hash for
+ * skipping the string comparison, because the key might
+ * have been added right after
+ * ext4_fname_setup_ci_filename(). In this case, a hash
+ * mismatch will be a false negative. Therefore, make
+ * sure cf_name was properly initialized before
+ * considering the calculated hash.
+ */
+ if (IS_ENCRYPTED(parent) && fname->cf_name.name &&
+ (fname->hinfo.hash != EXT4_DIRENT_HASH(de) ||
+ fname->hinfo.minor_hash != EXT4_DIRENT_MINOR_HASH(de)))
+ return false;
+ /*
+ * Treat comparison errors as not a match. The
+ * only case where it happens is on a disk
+ * corruption or ENOMEM.
+ */
+
+ return generic_ci_match(parent, fname->usr_fname,
+ &fname->cf_name, de->name,
+ de->name_len) > 0;
}
#endif
@@ -1618,7 +1574,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
&has_inline_data);
if (inlined)
*inlined = has_inline_data;
- if (has_inline_data)
+ if (has_inline_data || IS_ERR(ret))
goto cleanup_and_exit;
}
@@ -1765,7 +1721,6 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir,
struct buffer_head *bh;
err = ext4_fname_prepare_lookup(dir, dentry, &fname);
- generic_set_encrypted_ci_d_ops(dentry);
if (err == -ENOENT)
return NULL;
if (err)
@@ -1873,8 +1828,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
}
}
-#if IS_ENABLED(CONFIG_UNICODE)
- if (!inode && IS_CASEFOLDED(dir)) {
+ if (IS_ENABLED(CONFIG_UNICODE) && !inode && IS_CASEFOLDED(dir)) {
/* Eventually we want to call d_add_ci(dentry, NULL)
* for negative dentries in the encoding case as
* well. For now, prevent the negative dentry
@@ -1882,7 +1836,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
*/
return NULL;
}
-#endif
+
return d_splice_alias(inode, dentry);
}
@@ -2041,7 +1995,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
* split it in half by count; each resulting block will have at least
* half the space free.
*/
- if (i > 0)
+ if (i >= 0)
split = count - move;
else
split = count/2;
@@ -2210,7 +2164,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
* happen is that the times are slightly out of date
* and/or different from the directory change time.
*/
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
ext4_update_dx_flag(dir);
inode_inc_iversion(dir);
err2 = ext4_mark_inode_dirty(handle, dir);
@@ -2329,8 +2283,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
top = data2 + len;
while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) {
if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len,
- (data2 + (blocksize - csum_size) -
- (char *) de))) {
+ (char *)de - data2)) {
brelse(bh2);
brelse(bh);
return -EFSCORRUPTED;
@@ -2438,8 +2391,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
sb = dir->i_sb;
blocksize = sb->s_blocksize;
- if (!dentry->d_name.len)
- return -EINVAL;
if (fscrypt_is_nokey_name(dentry))
return -ENOKEY;
@@ -3248,7 +3199,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
* recovery. */
inode->i_size = 0;
ext4_orphan_add(handle, inode);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
inode_set_ctime_current(inode);
retval = ext4_mark_inode_dirty(handle, inode);
if (retval)
@@ -3258,16 +3209,14 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
ext4_fc_track_unlink(handle, dentry);
retval = ext4_mark_inode_dirty(handle, dir);
-#if IS_ENABLED(CONFIG_UNICODE)
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
* invalidating the dentries here, alongside with returning the
* negative dentries at ext4_lookup(), when it is better
* supported by the VFS for the CI case.
*/
- if (IS_CASEFOLDED(dir))
+ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
d_invalidate(dentry);
-#endif
end_rmdir:
brelse(bh);
@@ -3323,7 +3272,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto out_handle;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
ext4_update_dx_flag(dir);
retval = ext4_mark_inode_dirty(handle, dir);
if (retval)
@@ -3369,16 +3318,15 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
goto out_trace;
retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry), dentry);
-#if IS_ENABLED(CONFIG_UNICODE)
+
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
* invalidating the dentries here, alongside with returning the
* negative dentries at ext4_lookup(), when it is better
* supported by the VFS for the CI case.
*/
- if (IS_CASEFOLDED(dir))
+ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
d_invalidate(dentry);
-#endif
out_trace:
trace_ext4_unlink_exit(dentry, retval);
@@ -3635,10 +3583,14 @@ struct ext4_renament {
int dir_inlined;
};
-static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent)
+static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent, bool is_cross)
{
int retval;
+ ent->is_dir = true;
+ if (!is_cross)
+ return 0;
+
ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode,
&retval, &ent->parent_de,
&ent->dir_inlined);
@@ -3656,6 +3608,9 @@ static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent,
{
int retval;
+ if (!ent->dir_bh)
+ return 0;
+
ent->parent_de->inode = cpu_to_le32(dir_ino);
BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata");
if (!ent->dir_inlined) {
@@ -3691,7 +3646,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
if (ext4_has_feature_filetype(ent->dir->i_sb))
ent->de->file_type = file_type;
inode_inc_iversion(ent->dir);
- ent->dir->i_mtime = inode_set_ctime_current(ent->dir);
+ inode_set_mtime_to_ts(ent->dir, inode_set_ctime_current(ent->dir));
retval = ext4_mark_inode_dirty(handle, ent->dir);
BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
if (!ent->inlined) {
@@ -3944,7 +3899,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir))
goto end_rename;
}
- retval = ext4_rename_dir_prepare(handle, &old);
+ retval = ext4_rename_dir_prepare(handle, &old, new.dir != old.dir);
if (retval)
goto end_rename;
}
@@ -4006,9 +3961,9 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir,
ext4_dec_count(new.inode);
inode_set_ctime_current(new.inode);
}
- old.dir->i_mtime = inode_set_ctime_current(old.dir);
+ inode_set_mtime_to_ts(old.dir, inode_set_ctime_current(old.dir));
ext4_update_dx_flag(old.dir);
- if (old.dir_bh) {
+ if (old.is_dir) {
retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
if (retval)
goto end_rename;
@@ -4031,7 +3986,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (unlikely(retval))
goto end_rename;
- if (S_ISDIR(old.inode->i_mode)) {
+ if (old.is_dir) {
/*
* We disable fast commits here that's because the
* replay code is not yet capable of changing dot dot
@@ -4158,14 +4113,12 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
ext4_handle_sync(handle);
if (S_ISDIR(old.inode->i_mode)) {
- old.is_dir = true;
- retval = ext4_rename_dir_prepare(handle, &old);
+ retval = ext4_rename_dir_prepare(handle, &old, new.dir != old.dir);
if (retval)
goto end_rename;
}
if (S_ISDIR(new.inode->i_mode)) {
- new.is_dir = true;
- retval = ext4_rename_dir_prepare(handle, &new);
+ retval = ext4_rename_dir_prepare(handle, &new, new.dir != old.dir);
if (retval)
goto end_rename;
}
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
index e5b47dda3317..a23b0c01f809 100644
--- a/fs/ext4/orphan.c
+++ b/fs/ext4/orphan.c
@@ -590,8 +590,9 @@ int ext4_init_orphan_info(struct super_block *sb)
}
oi->of_blocks = inode->i_size >> sb->s_blocksize_bits;
oi->of_csum_seed = EXT4_I(inode)->i_csum_seed;
- oi->of_binfo = kmalloc(oi->of_blocks*sizeof(struct ext4_orphan_block),
- GFP_KERNEL);
+ oi->of_binfo = kmalloc_array(oi->of_blocks,
+ sizeof(struct ext4_orphan_block),
+ GFP_KERNEL);
if (!oi->of_binfo) {
ret = -ENOMEM;
goto out_put;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 7ab4f5a9bf5b..cb023922c93c 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -181,14 +181,25 @@ static int ext4_end_io_end(ext4_io_end_t *io_end)
"list->prev 0x%p\n",
io_end, inode->i_ino, io_end->list.next, io_end->list.prev);
- io_end->handle = NULL; /* Following call will use up the handle */
- ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
+ /*
+ * Do not convert the unwritten extents if data writeback fails,
+ * or stale data may be exposed.
+ */
+ io_end->handle = NULL; /* Following call will use up the handle */
+ if (unlikely(io_end->flag & EXT4_IO_END_FAILED)) {
+ ret = -EIO;
+ if (handle)
+ jbd2_journal_free_reserved(handle);
+ } else {
+ ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
+ }
if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) {
ext4_msg(inode->i_sb, KERN_EMERG,
"failed to convert unwritten extents to written "
"extents -- potential data loss! "
"(inode %lu, error %d)", inode->i_ino, ret);
}
+
ext4_clear_io_unwritten_flag(io_end);
ext4_release_io_end(io_end);
return ret;
@@ -344,6 +355,7 @@ static void ext4_end_bio(struct bio *bio)
bio->bi_status, inode->i_ino,
(unsigned long long)
bi_sector >> (inode->i_blkbits - 9));
+ io_end->flag |= EXT4_IO_END_FAILED;
mapping_set_error(inode->i_mapping,
blk_status_to_errno(bio->bi_status));
}
@@ -421,7 +433,7 @@ submit_and_retry:
io_submit_init_bio(io, bh);
if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh)))
goto submit_and_retry;
- wbc_account_cgroup_owner(io->io_wbc, &folio->page, bh->b_size);
+ wbc_account_cgroup_owner(io->io_wbc, folio, bh->b_size);
io->io_next_block++;
}
@@ -441,7 +453,7 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
BUG_ON(folio_test_writeback(folio));
/*
- * Comments copied from block_write_full_page:
+ * Comments copied from block_write_full_folio:
*
* The folio straddles i_size. It must be zeroed out on each and every
* writepage invocation because it may be mmapped. "A file is mapped
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 8cb83e7b699b..5d3a9dc9a32d 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -70,15 +70,8 @@ static void __read_end_io(struct bio *bio)
{
struct folio_iter fi;
- bio_for_each_folio_all(fi, bio) {
- struct folio *folio = fi.folio;
-
- if (bio->bi_status)
- folio_clear_uptodate(folio);
- else
- folio_mark_uptodate(folio);
- folio_unlock(folio);
- }
+ bio_for_each_folio_all(fi, bio)
+ folio_end_read(fi.folio, bio->bi_status == 0);
if (bio->bi_private)
mempool_free(bio->bi_private, bio_post_read_ctx_pool);
bio_put(bio);
@@ -228,7 +221,7 @@ int ext4_mpage_readpages(struct inode *inode,
sector_t block_in_file;
sector_t last_block;
sector_t last_block_in_file;
- sector_t blocks[MAX_BUF_PER_PAGE];
+ sector_t first_block;
unsigned page_block;
struct block_device *bdev = inode->i_sb->s_bdev;
int length;
@@ -270,6 +263,7 @@ int ext4_mpage_readpages(struct inode *inode,
unsigned map_offset = block_in_file - map.m_lblk;
unsigned last = map.m_len - map_offset;
+ first_block = map.m_pblk + map_offset;
for (relative_block = 0; ; relative_block++) {
if (relative_block == last) {
/* needed? */
@@ -278,8 +272,6 @@ int ext4_mpage_readpages(struct inode *inode,
}
if (page_block == blocks_per_page)
break;
- blocks[page_block] = map.m_pblk + map_offset +
- relative_block;
page_block++;
block_in_file++;
}
@@ -314,7 +306,9 @@ int ext4_mpage_readpages(struct inode *inode,
goto confused; /* hole -> non-hole */
/* Contiguous blocks? */
- if (page_block && blocks[page_block-1] != map.m_pblk-1)
+ if (!page_block)
+ first_block = map.m_pblk;
+ else if (first_block + page_block != map.m_pblk)
goto confused;
for (relative_block = 0; ; relative_block++) {
if (relative_block == map.m_len) {
@@ -323,7 +317,6 @@ int ext4_mpage_readpages(struct inode *inode,
break;
} else if (page_block == blocks_per_page)
break;
- blocks[page_block] = map.m_pblk+relative_block;
page_block++;
block_in_file++;
}
@@ -335,8 +328,7 @@ int ext4_mpage_readpages(struct inode *inode,
if (ext4_need_verity(inode, folio->index) &&
!fsverity_verify_folio(folio))
goto set_error_page;
- folio_mark_uptodate(folio);
- folio_unlock(folio);
+ folio_end_read(folio, true);
continue;
}
} else if (fully_mapped) {
@@ -347,7 +339,7 @@ int ext4_mpage_readpages(struct inode *inode,
* This folio will go to BIO. Do we need to send this
* BIO off first?
*/
- if (bio && (last_block_in_bio != blocks[0] - 1 ||
+ if (bio && (last_block_in_bio != first_block - 1 ||
!fscrypt_mergeable_bio(bio, inode, next_block))) {
submit_and_realloc:
submit_bio(bio);
@@ -363,7 +355,7 @@ int ext4_mpage_readpages(struct inode *inode,
fscrypt_set_bio_crypt_ctx(bio, inode, next_block,
GFP_KERNEL);
ext4_set_bio_post_read_ctx(bio, inode, folio->index);
- bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
+ bio->bi_iter.bi_sector = first_block << (blkbits - 9);
bio->bi_end_io = mpage_end_io;
if (rac)
bio->bi_opf |= REQ_RAHEAD;
@@ -379,7 +371,7 @@ int ext4_mpage_readpages(struct inode *inode,
submit_bio(bio);
bio = NULL;
} else
- last_block_in_bio = blocks[blocks_per_page - 1];
+ last_block_in_bio = first_block + blocks_per_page - 1;
continue;
confused:
if (bio) {
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b34007541e08..72f77f78ae8d 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -10,8 +10,6 @@
*/
-#define EXT4FS_DEBUG
-
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/jiffies.h>
@@ -57,7 +55,7 @@ int ext4_resize_begin(struct super_block *sb)
* If the reserved GDT blocks is non-zero, the resize_inode feature
* should always be set.
*/
- if (EXT4_SB(sb)->s_es->s_reserved_gdt_blocks &&
+ if (sbi->s_es->s_reserved_gdt_blocks &&
!ext4_has_feature_resize_inode(sb)) {
ext4_error(sb, "resize_inode disabled but reserved GDT blocks non-zero");
return -EFSCORRUPTED;
@@ -69,9 +67,9 @@ int ext4_resize_begin(struct super_block *sb)
* bad time to do it anyways.
*/
if (EXT4_B2C(sbi, sbi->s_sbh->b_blocknr) !=
- le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
+ le32_to_cpu(sbi->s_es->s_first_data_block)) {
ext4_warning(sb, "won't resize using backup superblock at %llu",
- (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
+ (unsigned long long)sbi->s_sbh->b_blocknr);
return -EPERM;
}
@@ -79,7 +77,7 @@ int ext4_resize_begin(struct super_block *sb)
* We are not allowed to do online-resizing on a filesystem mounted
* with error, because it can destroy the filesystem easily.
*/
- if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
+ if (sbi->s_mount_state & EXT4_ERROR_FS) {
ext4_warning(sb, "There are errors in the filesystem, "
"so online resizing is not allowed");
return -EPERM;
@@ -91,7 +89,7 @@ int ext4_resize_begin(struct super_block *sb)
}
if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING,
- &EXT4_SB(sb)->s_ext4_flags))
+ &sbi->s_ext4_flags))
ret = -EBUSY;
return ret;
@@ -106,18 +104,6 @@ int ext4_resize_end(struct super_block *sb, bool update_backups)
return 0;
}
-static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb,
- ext4_group_t group) {
- return (group >> EXT4_DESC_PER_BLOCK_BITS(sb)) <<
- EXT4_DESC_PER_BLOCK_BITS(sb);
-}
-
-static ext4_fsblk_t ext4_meta_bg_first_block_no(struct super_block *sb,
- ext4_group_t group) {
- group = ext4_meta_bg_first_group(sb, group);
- return ext4_group_first_block_no(sb, group);
-}
-
static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb,
ext4_group_t group) {
ext4_grpblk_t overhead;
@@ -154,8 +140,9 @@ static int verify_group_input(struct super_block *sb,
overhead = ext4_group_overhead_blocks(sb, group);
metaend = start + overhead;
- input->free_clusters_count = free_blocks_count =
- input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
+ free_blocks_count = input->blocks_count - 2 - overhead -
+ sbi->s_itb_per_group;
+ input->free_clusters_count = EXT4_B2C(sbi, free_blocks_count);
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks "
@@ -243,23 +230,35 @@ struct ext4_new_flex_group_data {
#define MAX_RESIZE_BG 16384
/*
- * alloc_flex_gd() allocates a ext4_new_flex_group_data with size of
- * @flexbg_size.
+ * alloc_flex_gd() allocates an ext4_new_flex_group_data that satisfies the
+ * resizing from @o_group to @n_group, its size is typically @flexbg_size.
*
* Returns NULL on failure otherwise address of the allocated structure.
*/
-static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned int flexbg_size)
+static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned int flexbg_size,
+ ext4_group_t o_group, ext4_group_t n_group)
{
+ ext4_group_t last_group;
+ unsigned int max_resize_bg;
struct ext4_new_flex_group_data *flex_gd;
flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS);
if (flex_gd == NULL)
goto out3;
- if (unlikely(flexbg_size > MAX_RESIZE_BG))
- flex_gd->resize_bg = MAX_RESIZE_BG;
- else
- flex_gd->resize_bg = flexbg_size;
+ max_resize_bg = umin(flexbg_size, MAX_RESIZE_BG);
+ flex_gd->resize_bg = max_resize_bg;
+
+ /* Avoid allocating large 'groups' array if not needed */
+ last_group = o_group | (flex_gd->resize_bg - 1);
+ if (n_group <= last_group)
+ flex_gd->resize_bg = 1 << fls(n_group - o_group);
+ else if (n_group - last_group < flex_gd->resize_bg)
+ flex_gd->resize_bg = 1 << max(fls(last_group - o_group),
+ fls(n_group - last_group));
+
+ if (WARN_ON_ONCE(flex_gd->resize_bg > max_resize_bg))
+ flex_gd->resize_bg = max_resize_bg;
flex_gd->groups = kmalloc_array(flex_gd->resize_bg,
sizeof(struct ext4_new_group_data),
@@ -468,8 +467,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
ext4_debug("mark clusters [%llu-%llu] used\n", first_cluster,
last_cluster);
- for (count2 = count; count > 0;
- count -= count2, first_cluster += count2) {
+ for (; count > 0; count -= count2, first_cluster += count2) {
ext4_fsblk_t start;
struct buffer_head *bh;
ext4_group_t group;
@@ -617,7 +615,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
}
handle_itb:
- /* Initialize group tables of the grop @group */
+ /* Initialize group tables of the group @group */
if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED))
goto handle_bb;
@@ -707,16 +705,14 @@ handle_ib:
block = start;
}
- if (count) {
- err = set_flexbg_block_bitmap(sb, handle,
- flex_gd,
- EXT4_B2C(sbi, start),
- EXT4_B2C(sbi,
- start + count
- - 1));
- if (err)
- goto out;
- }
+ err = set_flexbg_block_bitmap(sb, handle,
+ flex_gd,
+ EXT4_B2C(sbi, start),
+ EXT4_B2C(sbi,
+ start + count
+ - 1));
+ if (err)
+ goto out;
}
out:
@@ -955,7 +951,13 @@ errout:
}
/*
- * add_new_gdb_meta_bg is the sister of add_new_gdb.
+ * If there is no available space in the existing block group descriptors for
+ * the new block group and there are no reserved block group descriptors, then
+ * the meta_bg feature will get enabled, and es->s_first_meta_bg will get set
+ * to the first block group that is managed using meta_bg and s_first_meta_bg
+ * must be a multiple of EXT4_DESC_PER_BLOCK(sb).
+ * This function will be called when first group of meta_bg is added to bring
+ * new group descriptors block of new added meta_bg.
*/
static int add_new_gdb_meta_bg(struct super_block *sb,
handle_t *handle, ext4_group_t group) {
@@ -965,8 +967,8 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
int err;
- gdblock = ext4_meta_bg_first_block_no(sb, group) +
- ext4_bg_has_super(sb, group);
+ gdblock = ext4_group_first_block_no(sb, group) +
+ ext4_bg_has_super(sb, group);
gdb_bh = ext4_sb_bread(sb, gdblock, 0);
if (IS_ERR(gdb_bh))
return PTR_ERR(gdb_bh);
@@ -1090,9 +1092,6 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
for (i = 0; i < reserved_gdb; i++) {
int err2;
data = (__le32 *)primary[i]->b_data;
- /* printk("reserving backup %lu[%u] = %lu\n",
- primary[i]->b_blocknr, gdbackups,
- blk + primary[i]->b_blocknr); */
data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]);
if (!err)
@@ -1322,8 +1321,7 @@ static int ext4_set_bitmap_checksums(struct super_block *sb,
bh = ext4_get_bitmap(sb, group_data->inode_bitmap);
if (!bh)
return -EIO;
- ext4_inode_bitmap_csum_set(sb, gdp, bh,
- EXT4_INODES_PER_GROUP(sb) / 8);
+ ext4_inode_bitmap_csum_set(sb, gdp, bh);
brelse(bh);
bh = ext4_get_bitmap(sb, group_data->block_bitmap);
@@ -1609,7 +1607,6 @@ exit_journal:
gdb_num >= le32_to_cpu(es->s_first_meta_bg);
sector_t padding_blocks = meta_bg ? 0 : sbi->s_sbh->b_blocknr -
ext4_group_first_block_no(sb, 0);
- sector_t old_gdb = 0;
update_backups(sb, ext4_group_first_block_no(sb, 0),
(char *)es, sizeof(struct ext4_super_block), 0);
@@ -1618,11 +1615,8 @@ exit_journal:
gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc,
gdb_num);
- if (old_gdb == gdb_bh->b_blocknr)
- continue;
update_backups(sb, gdb_bh->b_blocknr - padding_blocks,
gdb_bh->b_data, gdb_bh->b_size, meta_bg);
- old_gdb = gdb_bh->b_blocknr;
}
}
exit:
@@ -2092,7 +2086,7 @@ retry:
}
}
- if ((!resize_inode && !meta_bg) || n_blocks_count == o_blocks_count) {
+ if ((!resize_inode && !meta_bg && n_desc_blocks > o_desc_blocks) || n_blocks_count == o_blocks_count) {
err = ext4_convert_meta_bg(sb, resize_inode);
if (err)
goto out;
@@ -2149,7 +2143,7 @@ retry:
if (err)
goto out;
- flex_gd = alloc_flex_gd(flexbg_size);
+ flex_gd = alloc_flex_gd(flexbg_size, o_group, n_group);
if (flex_gd == NULL) {
err = -ENOMEM;
goto out;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2346ef071b24..722ac723f49b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -251,18 +251,25 @@ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb,
struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block,
blk_opf_t op_flags)
{
- return __ext4_sb_bread_gfp(sb, block, op_flags, __GFP_MOVABLE);
+ gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping,
+ ~__GFP_FS) | __GFP_MOVABLE;
+
+ return __ext4_sb_bread_gfp(sb, block, op_flags, gfp);
}
struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
sector_t block)
{
- return __ext4_sb_bread_gfp(sb, block, 0, 0);
+ gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping,
+ ~__GFP_FS);
+
+ return __ext4_sb_bread_gfp(sb, block, 0, gfp);
}
void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
{
- struct buffer_head *bh = sb_getblk_gfp(sb, block, 0);
+ struct buffer_head *bh = bdev_getblk(sb->s_bdev, block,
+ sb->s_blocksize, GFP_NOWAIT | __GFP_NOWARN);
if (likely(bh)) {
if (trylock_buffer(bh))
@@ -492,22 +499,6 @@ static void ext4_maybe_update_superblock(struct super_block *sb)
schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
}
-/*
- * The del_gendisk() function uninitializes the disk-specific data
- * structures, including the bdi structure, without telling anyone
- * else. Once this happens, any attempt to call mark_buffer_dirty()
- * (for example, by ext4_commit_super), will cause a kernel OOPS.
- * This is a kludge to prevent these oops until we can put in a proper
- * hook in del_gendisk() to inform the VFS and file system layers.
- */
-static int block_device_ejected(struct super_block *sb)
-{
- struct inode *bd_inode = sb->s_bdev->bd_inode;
- struct backing_dev_info *bdi = inode_to_bdi(bd_inode);
-
- return bdi->dev == NULL;
-}
-
static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
{
struct super_block *sb = journal->j_private;
@@ -1344,6 +1335,9 @@ static void ext4_put_super(struct super_block *sb)
ext4_group_desc_free(sbi);
ext4_flex_groups_free(sbi);
+
+ WARN_ON_ONCE(!(sbi->s_mount_state & EXT4_ERROR_FS) &&
+ percpu_counter_sum(&sbi->s_dirtyclusters_counter));
ext4_percpu_param_destroy(sbi);
#ifdef CONFIG_QUOTA
for (int i = 0; i < EXT4_MAXQUOTAS; i++)
@@ -1360,14 +1354,14 @@ static void ext4_put_super(struct super_block *sb)
sync_blockdev(sb->s_bdev);
invalidate_bdev(sb->s_bdev);
- if (sbi->s_journal_bdev) {
+ if (sbi->s_journal_bdev_file) {
/*
* Invalidate the journal device's buffers. We don't want them
* floating about in memory - the physical journal device may
* hotswapped, and it breaks the `ro-after' testing code.
*/
- sync_blockdev(sbi->s_journal_bdev);
- invalidate_bdev(sbi->s_journal_bdev);
+ sync_blockdev(file_bdev(sbi->s_journal_bdev_file));
+ invalidate_bdev(file_bdev(sbi->s_journal_bdev_file));
}
ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
@@ -1474,7 +1468,8 @@ static void ext4_destroy_inode(struct inode *inode)
dump_stack();
}
- if (EXT4_I(inode)->i_reserved_data_blocks)
+ if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ERROR_FS) &&
+ WARN_ON_ONCE(EXT4_I(inode)->i_reserved_data_blocks))
ext4_msg(inode->i_sb, KERN_ERR,
"Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!",
inode->i_ino, EXT4_I(inode),
@@ -1501,8 +1496,7 @@ static int __init init_inodecache(void)
{
ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache",
sizeof(struct ext4_inode_info), 0,
- (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
- SLAB_ACCOUNT),
+ SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
offsetof(struct ext4_inode_info, i_data),
sizeof_field(struct ext4_inode_info, i_data),
init_once);
@@ -1526,7 +1520,7 @@ void ext4_clear_inode(struct inode *inode)
ext4_fc_del(inode);
invalidate_inode_buffers(inode);
clear_inode(inode);
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
dquot_drop(inode);
if (EXT4_I(inode)->jinode) {
@@ -1655,6 +1649,7 @@ static const struct super_operations ext4_sops = {
};
static const struct export_operations ext4_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = ext4_fh_to_dentry,
.fh_to_parent = ext4_fh_to_parent,
.get_parent = ext4_get_parent,
@@ -1724,10 +1719,6 @@ static const struct constant_table ext4_param_dax[] = {
{}
};
-/* String parameter that allows empty argument */
-#define fsparam_string_empty(NAME, OPT) \
- __fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL)
-
/*
* Mount option specification
* We don't use fsparam_flag_no because of the way we set the
@@ -1742,8 +1733,8 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
fsparam_flag ("bsdgroups", Opt_grpid),
fsparam_flag ("nogrpid", Opt_nogrpid),
fsparam_flag ("sysvgroups", Opt_nogrpid),
- fsparam_u32 ("resgid", Opt_resgid),
- fsparam_u32 ("resuid", Opt_resuid),
+ fsparam_gid ("resgid", Opt_resgid),
+ fsparam_uid ("resuid", Opt_resuid),
fsparam_u32 ("sb", Opt_sb),
fsparam_enum ("errors", Opt_errors, ext4_param_errors),
fsparam_flag ("nouid32", Opt_nouid32),
@@ -2028,6 +2019,9 @@ int ext4_init_fs_context(struct fs_context *fc)
fc->fs_private = ctx;
fc->ops = &ext4_context_ops;
+ /* i_version is always enabled now */
+ fc->sb_flags |= SB_I_VERSION;
+
return 0;
}
@@ -2079,8 +2073,7 @@ static int unnote_qf_name(struct fs_context *fc, int qtype)
{
struct ext4_fs_context *ctx = fc->fs_private;
- if (ctx->s_qf_names[qtype])
- kfree(ctx->s_qf_names[qtype]);
+ kfree(ctx->s_qf_names[qtype]);
ctx->s_qf_names[qtype] = NULL;
ctx->qname_spec |= 1 << qtype;
@@ -2149,8 +2142,6 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
struct fs_parse_result result;
const struct mount_opts *m;
int is_remount;
- kuid_t uid;
- kgid_t gid;
int token;
token = fs_parse(fc, ext4_param_specs, param, &result);
@@ -2292,23 +2283,11 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
ctx->spec |= EXT4_SPEC_s_stripe;
return 0;
case Opt_resuid:
- uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(uid)) {
- ext4_msg(NULL, KERN_ERR, "Invalid uid value %d",
- result.uint_32);
- return -EINVAL;
- }
- ctx->s_resuid = uid;
+ ctx->s_resuid = result.uid;
ctx->spec |= EXT4_SPEC_s_resuid;
return 0;
case Opt_resgid:
- gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(gid)) {
- ext4_msg(NULL, KERN_ERR, "Invalid gid value %d",
- result.uint_32);
- return -EINVAL;
- }
- ctx->s_resgid = gid;
+ ctx->s_resgid = result.gid;
ctx->spec |= EXT4_SPEC_s_resgid;
return 0;
case Opt_journal_dev:
@@ -2485,8 +2464,7 @@ static int parse_options(struct fs_context *fc, char *options)
param.size = v_len;
ret = ext4_parse_param(fc, &param);
- if (param.string)
- kfree(param.string);
+ kfree(param.string);
if (ret < 0)
return ret;
}
@@ -2793,15 +2771,6 @@ static int ext4_check_opt_consistency(struct fs_context *fc,
return -EINVAL;
}
- if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DIOREAD_NOLOCK)) {
- int blocksize =
- BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
- if (blocksize < PAGE_SIZE)
- ext4_msg(NULL, KERN_WARNING, "Warning: mounting with an "
- "experimental mount option 'dioread_nolock' "
- "for blocksize < PAGE_SIZE");
- }
-
err = ext4_check_test_dummy_encryption(fc, sb);
if (err)
return err;
@@ -2821,6 +2790,13 @@ static int ext4_check_opt_consistency(struct fs_context *fc,
}
if (is_remount) {
+ if (!sbi->s_journal &&
+ ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT)) {
+ ext4_msg(NULL, KERN_WARNING,
+ "Remounting fs w/o journal so ignoring data_err option");
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT);
+ }
+
if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) &&
(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
ext4_msg(NULL, KERN_ERR, "can't mount with "
@@ -3087,7 +3063,7 @@ int ext4_seq_options_show(struct seq_file *seq, void *offset)
seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw");
rc = _ext4_show_options(seq, sb, 1);
- seq_puts(seq, "\n");
+ seq_putc(seq, '\n');
return rc;
}
@@ -3618,14 +3594,12 @@ int ext4_feature_set_ok(struct super_block *sb, int readonly)
return 0;
}
-#if !IS_ENABLED(CONFIG_UNICODE)
- if (ext4_has_feature_casefold(sb)) {
+ if (!IS_ENABLED(CONFIG_UNICODE) && ext4_has_feature_casefold(sb)) {
ext4_msg(sb, KERN_ERR,
"Filesystem with casefold feature cannot be "
"mounted without CONFIG_UNICODE");
return 0;
}
-#endif
if (readonly)
return 1;
@@ -4242,7 +4216,7 @@ int ext4_calculate_overhead(struct super_block *sb)
* Add the internal journal blocks whether the journal has been
* loaded or not
*/
- if (sbi->s_journal && !sbi->s_journal_bdev)
+ if (sbi->s_journal && !sbi->s_journal_bdev_file)
overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len);
else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
/* j_inum for internal journal is non-zero */
@@ -4410,7 +4384,7 @@ static void ext4_set_def_opts(struct super_block *sb,
((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
set_opt(sb, DELALLOC);
- if (sb->s_blocksize == PAGE_SIZE)
+ if (sb->s_blocksize <= PAGE_SIZE)
set_opt(sb, DIOREAD_NOLOCK);
}
@@ -4431,22 +4405,6 @@ static int ext4_handle_clustersize(struct super_block *sb)
}
sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
le32_to_cpu(es->s_log_block_size);
- sbi->s_clusters_per_group =
- le32_to_cpu(es->s_clusters_per_group);
- if (sbi->s_clusters_per_group > sb->s_blocksize * 8) {
- ext4_msg(sb, KERN_ERR,
- "#clusters per group too big: %lu",
- sbi->s_clusters_per_group);
- return -EINVAL;
- }
- if (sbi->s_blocks_per_group !=
- (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) {
- ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
- "clusters per group (%lu) inconsistent",
- sbi->s_blocks_per_group,
- sbi->s_clusters_per_group);
- return -EINVAL;
- }
} else {
if (clustersize != sb->s_blocksize) {
ext4_msg(sb, KERN_ERR,
@@ -4460,9 +4418,21 @@ static int ext4_handle_clustersize(struct super_block *sb)
sbi->s_blocks_per_group);
return -EINVAL;
}
- sbi->s_clusters_per_group = sbi->s_blocks_per_group;
sbi->s_cluster_bits = 0;
}
+ sbi->s_clusters_per_group = le32_to_cpu(es->s_clusters_per_group);
+ if (sbi->s_clusters_per_group > sb->s_blocksize * 8) {
+ ext4_msg(sb, KERN_ERR, "#clusters per group too big: %lu",
+ sbi->s_clusters_per_group);
+ return -EINVAL;
+ }
+ if (sbi->s_blocks_per_group !=
+ (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) {
+ ext4_msg(sb, KERN_ERR,
+ "blocks per group (%lu) and clusters per group (%lu) inconsistent",
+ sbi->s_blocks_per_group, sbi->s_clusters_per_group);
+ return -EINVAL;
+ }
sbi->s_cluster_ratio = clustersize / sb->s_blocksize;
/* Do we have standard group size of clustersize * 8 blocks ? */
@@ -5135,16 +5105,27 @@ out:
return ret;
}
-static void ext4_hash_info_init(struct super_block *sb)
+static int ext4_hash_info_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
unsigned int i;
+ sbi->s_def_hash_version = es->s_def_hash_version;
+
+ if (sbi->s_def_hash_version > DX_HASH_LAST) {
+ ext4_msg(sb, KERN_ERR,
+ "Invalid default hash set in the superblock");
+ return -EINVAL;
+ } else if (sbi->s_def_hash_version == DX_HASH_SIPHASH) {
+ ext4_msg(sb, KERN_ERR,
+ "SIPHASH is not a valid default hash value");
+ return -EINVAL;
+ }
+
for (i = 0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
- sbi->s_def_hash_version = es->s_def_hash_version;
if (ext4_has_feature_dir_index(sb)) {
i = le32_to_cpu(es->s_flags);
if (i & EXT2_FLAGS_UNSIGNED_HASH)
@@ -5162,6 +5143,7 @@ static void ext4_hash_info_init(struct super_block *sb)
#endif
}
}
+ return 0;
}
static int ext4_block_group_meta_init(struct super_block *sb, int silent)
@@ -5298,9 +5280,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
- /* i_version is always enabled now */
- sb->s_flags |= SB_I_VERSION;
-
err = ext4_check_feature_compatibility(sb, es, silent);
if (err)
goto failed_mount;
@@ -5309,7 +5288,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (err)
goto failed_mount;
- ext4_hash_info_init(sb);
+ err = ext4_hash_info_init(sb);
+ if (err)
+ goto failed_mount;
err = ext4_handle_clustersize(sb);
if (err)
@@ -5361,11 +5342,14 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
sb->s_qcop = &ext4_qctl_operations;
sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
#endif
- memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
+ super_set_uuid(sb, es->s_uuid, sizeof(es->s_uuid));
+ super_set_sysfs_name_bdev(sb);
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
mutex_init(&sbi->s_orphan_lock);
+ spin_lock_init(&sbi->s_bdev_wb_lock);
+
ext4_fast_commit_init(sb);
sb->s_root = NULL;
@@ -5389,6 +5373,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
err = ext4_load_and_init_journal(sb, es, ctx);
if (err)
goto failed_mount3a;
+ if (bdev_read_only(sb->s_bdev))
+ needs_recovery = 0;
} else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) &&
ext4_has_feature_journal_needs_recovery(sb)) {
ext4_msg(sb, KERN_ERR, "required journal recovery "
@@ -5419,6 +5405,11 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
"data=, fs mounted w/o journal");
goto failed_mount3a;
}
+ if (test_opt(sb, DATA_ERR_ABORT)) {
+ ext4_msg(sb, KERN_ERR,
+ "can't mount with data_err=abort, fs mounted w/o journal");
+ goto failed_mount3a;
+ }
sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
clear_opt(sb, JOURNAL_CHECKSUM);
clear_opt(sb, DATA_FLAGS);
@@ -5499,6 +5490,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
goto failed_mount4;
}
+ generic_set_sb_d_ops(sb);
sb->s_root = d_make_root(root);
if (!sb->s_root) {
ext4_msg(sb, KERN_ERR, "get root dentry failed");
@@ -5586,8 +5578,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
* Save the original bdev mapping's wb_err value which could be
* used to detect the metadata async write error.
*/
- spin_lock_init(&sbi->s_bdev_wb_lock);
- errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err,
+ errseq_check_and_advance(&sb->s_bdev->bd_mapping->wb_err,
&sbi->s_bdev_wb_err);
EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
ext4_orphan_cleanup(sb, es);
@@ -5683,9 +5674,9 @@ failed_mount:
#endif
fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
brelse(sbi->s_sbh);
- if (sbi->s_journal_bdev) {
- invalidate_bdev(sbi->s_journal_bdev);
- blkdev_put(sbi->s_journal_bdev, sb);
+ if (sbi->s_journal_bdev_file) {
+ invalidate_bdev(file_bdev(sbi->s_journal_bdev_file));
+ bdev_fput(sbi->s_journal_bdev_file);
}
out_fail:
invalidate_bdev(sb->s_bdev);
@@ -5855,30 +5846,30 @@ static journal_t *ext4_open_inode_journal(struct super_block *sb,
return journal;
}
-static struct block_device *ext4_get_journal_blkdev(struct super_block *sb,
+static struct file *ext4_get_journal_blkdev(struct super_block *sb,
dev_t j_dev, ext4_fsblk_t *j_start,
ext4_fsblk_t *j_len)
{
struct buffer_head *bh;
struct block_device *bdev;
+ struct file *bdev_file;
int hblock, blocksize;
ext4_fsblk_t sb_block;
unsigned long offset;
struct ext4_super_block *es;
int errno;
- /* see get_tree_bdev why this is needed and safe */
- up_write(&sb->s_umount);
- bdev = blkdev_get_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb,
- &fs_holder_ops);
- down_write(&sb->s_umount);
- if (IS_ERR(bdev)) {
+ bdev_file = bdev_file_open_by_dev(j_dev,
+ BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES,
+ sb, &fs_holder_ops);
+ if (IS_ERR(bdev_file)) {
ext4_msg(sb, KERN_ERR,
"failed to open journal device unknown-block(%u,%u) %ld",
- MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev));
- return ERR_CAST(bdev);
+ MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_file));
+ return bdev_file;
}
+ bdev = file_bdev(bdev_file);
blocksize = sb->s_blocksize;
hblock = bdev_logical_block_size(bdev);
if (blocksize < hblock) {
@@ -5890,7 +5881,7 @@ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb,
sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
offset = EXT4_MIN_BLOCK_SIZE % blocksize;
- set_blocksize(bdev, blocksize);
+ set_blocksize(bdev_file, blocksize);
bh = __bread(bdev, sb_block, blocksize);
if (!bh) {
ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
@@ -5925,12 +5916,12 @@ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb,
*j_start = sb_block + 1;
*j_len = ext4_blocks_count(es);
brelse(bh);
- return bdev;
+ return bdev_file;
out_bh:
brelse(bh);
out_bdev:
- blkdev_put(bdev, sb);
+ bdev_fput(bdev_file);
return ERR_PTR(errno);
}
@@ -5940,14 +5931,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb,
journal_t *journal;
ext4_fsblk_t j_start;
ext4_fsblk_t j_len;
- struct block_device *journal_bdev;
+ struct file *bdev_file;
int errno = 0;
- journal_bdev = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len);
- if (IS_ERR(journal_bdev))
- return ERR_CAST(journal_bdev);
+ bdev_file = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len);
+ if (IS_ERR(bdev_file))
+ return ERR_CAST(bdev_file);
- journal = jbd2_journal_init_dev(journal_bdev, sb->s_bdev, j_start,
+ journal = jbd2_journal_init_dev(file_bdev(bdev_file), sb->s_bdev, j_start,
j_len, sb->s_blocksize);
if (IS_ERR(journal)) {
ext4_msg(sb, KERN_ERR, "failed to create device journal");
@@ -5962,14 +5953,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb,
goto out_journal;
}
journal->j_private = sb;
- EXT4_SB(sb)->s_journal_bdev = journal_bdev;
+ EXT4_SB(sb)->s_journal_bdev_file = bdev_file;
ext4_init_journal_params(sb, journal);
return journal;
out_journal:
jbd2_journal_destroy(journal);
out_bdev:
- blkdev_put(journal_bdev, sb);
+ bdev_fput(bdev_file);
return ERR_PTR(errno);
}
@@ -6147,8 +6138,8 @@ static void ext4_update_super(struct super_block *sb)
__ext4_update_tstamp(&es->s_first_error_time,
&es->s_first_error_time_hi,
sbi->s_first_error_time);
- strncpy(es->s_first_error_func, sbi->s_first_error_func,
- sizeof(es->s_first_error_func));
+ strtomem_pad(es->s_first_error_func,
+ sbi->s_first_error_func, 0);
es->s_first_error_line =
cpu_to_le32(sbi->s_first_error_line);
es->s_first_error_ino =
@@ -6161,8 +6152,7 @@ static void ext4_update_super(struct super_block *sb)
__ext4_update_tstamp(&es->s_last_error_time,
&es->s_last_error_time_hi,
sbi->s_last_error_time);
- strncpy(es->s_last_error_func, sbi->s_last_error_func,
- sizeof(es->s_last_error_func));
+ strtomem_pad(es->s_last_error_func, sbi->s_last_error_func, 0);
es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line);
es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino);
es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block);
@@ -6189,8 +6179,6 @@ static int ext4_commit_super(struct super_block *sb)
if (!sbh)
return -EINVAL;
- if (block_device_ejected(sb))
- return -ENODEV;
ext4_update_super(sb);
@@ -6770,6 +6758,7 @@ static int ext4_reconfigure(struct fs_context *fc)
{
struct super_block *sb = fc->root->d_sb;
int ret;
+ bool old_ro = sb_rdonly(sb);
fc->s_fs_info = EXT4_SB(sb);
@@ -6781,9 +6770,9 @@ static int ext4_reconfigure(struct fs_context *fc)
if (ret < 0)
return ret;
- ext4_msg(sb, KERN_INFO, "re-mounted %pU %s. Quota mode: %s.",
- &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w",
- ext4_quota_mode(sb));
+ ext4_msg(sb, KERN_INFO, "re-mounted %pU%s.",
+ &sb->s_uuid,
+ (old_ro != sb_rdonly(sb)) ? (sb_rdonly(sb) ? " ro" : " r/w") : "");
return 0;
}
@@ -6807,22 +6796,29 @@ static int ext4_statfs_project(struct super_block *sb,
dquot->dq_dqb.dqb_bhardlimit);
limit >>= sb->s_blocksize_bits;
- if (limit && buf->f_blocks > limit) {
+ if (limit) {
+ uint64_t remaining = 0;
+
curblock = (dquot->dq_dqb.dqb_curspace +
dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
- buf->f_blocks = limit;
- buf->f_bfree = buf->f_bavail =
- (buf->f_blocks > curblock) ?
- (buf->f_blocks - curblock) : 0;
+ if (limit > curblock)
+ remaining = limit - curblock;
+
+ buf->f_blocks = min(buf->f_blocks, limit);
+ buf->f_bfree = min(buf->f_bfree, remaining);
+ buf->f_bavail = min(buf->f_bavail, remaining);
}
limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit,
dquot->dq_dqb.dqb_ihardlimit);
- if (limit && buf->f_files > limit) {
- buf->f_files = limit;
- buf->f_ffree =
- (buf->f_files > dquot->dq_dqb.dqb_curinodes) ?
- (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0;
+ if (limit) {
+ uint64_t remaining = 0;
+
+ if (limit > dquot->dq_dqb.dqb_curinodes)
+ remaining = limit - dquot->dq_dqb.dqb_curinodes;
+
+ buf->f_files = min(buf->f_files, limit);
+ buf->f_ffree = min(buf->f_ffree, remaining);
}
spin_unlock(&dquot->dq_dqb_lock);
@@ -6925,12 +6921,25 @@ static int ext4_release_dquot(struct dquot *dquot)
{
int ret, err;
handle_t *handle;
+ bool freeze_protected = false;
+
+ /*
+ * Trying to sb_start_intwrite() in a running transaction
+ * can result in a deadlock. Further, running transactions
+ * are already protected from freezing.
+ */
+ if (!ext4_journal_current_handle()) {
+ sb_start_intwrite(dquot->dq_sb);
+ freeze_protected = true;
+ }
handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
if (IS_ERR(handle)) {
/* Release dquot anyway to avoid endless cycle in dqput() */
dquot_release(dquot);
+ if (freeze_protected)
+ sb_end_intwrite(dquot->dq_sb);
return PTR_ERR(handle);
}
ret = dquot_release(dquot);
@@ -6941,6 +6950,10 @@ static int ext4_release_dquot(struct dquot *dquot)
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
+
+ if (freeze_protected)
+ sb_end_intwrite(dquot->dq_sb);
+
return ret;
}
@@ -7180,7 +7193,7 @@ static int ext4_quota_off(struct super_block *sb, int type)
}
EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL);
inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
err = ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
out_unlock:
@@ -7353,12 +7366,12 @@ static inline int ext3_feature_set_ok(struct super_block *sb)
static void ext4_kill_sb(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct block_device *journal_bdev = sbi ? sbi->s_journal_bdev : NULL;
+ struct file *bdev_file = sbi ? sbi->s_journal_bdev_file : NULL;
kill_block_super(sb);
- if (journal_bdev)
- blkdev_put(journal_bdev, sb);
+ if (bdev_file)
+ bdev_fput(bdev_file);
}
static struct file_system_type ext4_fs_type = {
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 75bf1f88843c..645240cc0229 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -92,10 +92,12 @@ static const char *ext4_get_link(struct dentry *dentry, struct inode *inode,
if (!dentry) {
bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT);
- if (IS_ERR(bh))
- return ERR_CAST(bh);
- if (!bh || !ext4_buffer_uptodate(bh))
+ if (IS_ERR(bh) || !bh)
return ERR_PTR(-ECHILD);
+ if (!ext4_buffer_uptodate(bh)) {
+ brelse(bh);
+ return ERR_PTR(-ECHILD);
+ }
} else {
bh = ext4_bread(NULL, inode, 0, 0);
if (IS_ERR(bh))
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index d65dccb44ed5..ddb54608ca2e 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -30,7 +30,9 @@ typedef enum {
attr_first_error_time,
attr_last_error_time,
attr_clusters_in_group,
+ attr_mb_order,
attr_feature,
+ attr_pointer_pi,
attr_pointer_ui,
attr_pointer_ul,
attr_pointer_u64,
@@ -179,6 +181,9 @@ static struct ext4_attr ext4_attr_##_name = { \
#define EXT4_RO_ATTR_ES_STRING(_name,_elname,_size) \
EXT4_ATTR_STRING(_name, 0444, _size, ext4_super_block, _elname)
+#define EXT4_RW_ATTR_SBI_PI(_name,_elname) \
+ EXT4_ATTR_OFFSET(_name, 0644, pointer_pi, ext4_sb_info, _elname)
+
#define EXT4_RW_ATTR_SBI_UI(_name,_elname) \
EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname)
@@ -210,6 +215,8 @@ EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
ext4_sb_info, s_inode_readahead_blks);
EXT4_ATTR_OFFSET(mb_group_prealloc, 0644, clusters_in_group,
ext4_sb_info, s_mb_group_prealloc);
+EXT4_ATTR_OFFSET(mb_best_avail_max_trim_order, 0644, mb_order,
+ ext4_sb_info, s_mb_best_avail_max_trim_order);
EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
@@ -219,13 +226,12 @@ EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups);
EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
-EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
-EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst);
-EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
-EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
-EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
-EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
-EXT4_RW_ATTR_SBI_UI(mb_best_avail_max_trim_order, s_mb_best_avail_max_trim_order);
+EXT4_RW_ATTR_SBI_PI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_PI(err_ratelimit_burst, s_err_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_PI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_PI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_PI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_PI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
#ifdef CONFIG_EXT4_DEBUG
EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail);
#endif
@@ -368,13 +374,45 @@ static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi)
#define print_tstamp(buf, es, tstamp) \
__print_tstamp(buf, (es)->tstamp, (es)->tstamp ## _hi)
+static ssize_t ext4_generic_attr_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ void *ptr = calc_ptr(a, sbi);
+
+ if (!ptr)
+ return 0;
+
+ switch (a->attr_id) {
+ case attr_inode_readahead:
+ case attr_clusters_in_group:
+ case attr_mb_order:
+ case attr_pointer_pi:
+ case attr_pointer_ui:
+ if (a->attr_ptr == ptr_ext4_super_block_offset)
+ return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr));
+ return sysfs_emit(buf, "%u\n", *((unsigned int *) ptr));
+ case attr_pointer_ul:
+ return sysfs_emit(buf, "%lu\n", *((unsigned long *) ptr));
+ case attr_pointer_u8:
+ return sysfs_emit(buf, "%u\n", *((unsigned char *) ptr));
+ case attr_pointer_u64:
+ if (a->attr_ptr == ptr_ext4_super_block_offset)
+ return sysfs_emit(buf, "%llu\n", le64_to_cpup(ptr));
+ return sysfs_emit(buf, "%llu\n", *((unsigned long long *) ptr));
+ case attr_pointer_string:
+ return sysfs_emit(buf, "%.*s\n", a->attr_size, (char *) ptr);
+ case attr_pointer_atomic:
+ return sysfs_emit(buf, "%d\n", atomic_read((atomic_t *) ptr));
+ }
+ return 0;
+}
+
static ssize_t ext4_attr_show(struct kobject *kobj,
struct attribute *attr, char *buf)
{
struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
s_kobj);
struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
- void *ptr = calc_ptr(a, sbi);
switch (a->attr_id) {
case attr_delayed_allocation_blocks:
@@ -393,46 +431,6 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
return sysfs_emit(buf, "%llu\n",
(unsigned long long)
percpu_counter_sum(&sbi->s_sra_exceeded_retry_limit));
- case attr_inode_readahead:
- case attr_clusters_in_group:
- case attr_pointer_ui:
- if (!ptr)
- return 0;
- if (a->attr_ptr == ptr_ext4_super_block_offset)
- return sysfs_emit(buf, "%u\n",
- le32_to_cpup(ptr));
- else
- return sysfs_emit(buf, "%u\n",
- *((unsigned int *) ptr));
- case attr_pointer_ul:
- if (!ptr)
- return 0;
- return sysfs_emit(buf, "%lu\n",
- *((unsigned long *) ptr));
- case attr_pointer_u8:
- if (!ptr)
- return 0;
- return sysfs_emit(buf, "%u\n",
- *((unsigned char *) ptr));
- case attr_pointer_u64:
- if (!ptr)
- return 0;
- if (a->attr_ptr == ptr_ext4_super_block_offset)
- return sysfs_emit(buf, "%llu\n",
- le64_to_cpup(ptr));
- else
- return sysfs_emit(buf, "%llu\n",
- *((unsigned long long *) ptr));
- case attr_pointer_string:
- if (!ptr)
- return 0;
- return sysfs_emit(buf, "%.*s\n", a->attr_size,
- (char *) ptr);
- case attr_pointer_atomic:
- if (!ptr)
- return 0;
- return sysfs_emit(buf, "%d\n",
- atomic_read((atomic_t *) ptr));
case attr_feature:
return sysfs_emit(buf, "supported\n");
case attr_first_error_time:
@@ -441,29 +439,33 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
return print_tstamp(buf, sbi->s_es, s_last_error_time);
case attr_journal_task:
return journal_task_show(sbi, buf);
+ default:
+ return ext4_generic_attr_show(a, sbi, buf);
}
-
- return 0;
}
-static ssize_t ext4_attr_store(struct kobject *kobj,
- struct attribute *attr,
- const char *buf, size_t len)
+static ssize_t ext4_generic_attr_store(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ const char *buf, size_t len)
{
- struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
- s_kobj);
- struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
- void *ptr = calc_ptr(a, sbi);
+ int ret;
unsigned int t;
unsigned long lt;
- int ret;
+ void *ptr = calc_ptr(a, sbi);
+
+ if (!ptr)
+ return 0;
switch (a->attr_id) {
- case attr_reserved_clusters:
- return reserved_clusters_store(sbi, buf, len);
+ case attr_pointer_pi:
+ ret = kstrtouint(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+ if ((int)t < 0)
+ return -EINVAL;
+ *((unsigned int *) ptr) = t;
+ return len;
case attr_pointer_ui:
- if (!ptr)
- return 0;
ret = kstrtouint(skip_spaces(buf), 0, &t);
if (ret)
return ret;
@@ -472,9 +474,15 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
else
*((unsigned int *) ptr) = t;
return len;
+ case attr_mb_order:
+ ret = kstrtouint(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+ if (t > 64)
+ return -EINVAL;
+ *((unsigned int *) ptr) = t;
+ return len;
case attr_clusters_in_group:
- if (!ptr)
- return 0;
ret = kstrtouint(skip_spaces(buf), 0, &t);
if (ret)
return ret;
@@ -483,19 +491,33 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
*((unsigned int *) ptr) = t;
return len;
case attr_pointer_ul:
- if (!ptr)
- return 0;
ret = kstrtoul(skip_spaces(buf), 0, &lt);
if (ret)
return ret;
*((unsigned long *) ptr) = lt;
return len;
+ }
+ return 0;
+}
+
+static ssize_t ext4_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+ s_kobj);
+ struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+
+ switch (a->attr_id) {
+ case attr_reserved_clusters:
+ return reserved_clusters_store(sbi, buf, len);
case attr_inode_readahead:
return inode_readahead_blks_store(sbi, buf, len);
case attr_trigger_test_error:
return trigger_test_error(sbi, buf, len);
+ default:
+ return ext4_generic_attr_store(a, sbi, buf, len);
}
- return 0;
}
static void ext4_sb_release(struct kobject *kobj)
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index 2f37e1ea3955..d9203228ce97 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -76,17 +76,17 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count,
while (count) {
size_t n = min_t(size_t, count,
PAGE_SIZE - offset_in_page(pos));
- struct page *page;
+ struct folio *folio;
void *fsdata = NULL;
int res;
- res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata);
+ res = aops->write_begin(NULL, mapping, pos, n, &folio, &fsdata);
if (res)
return res;
- memcpy_to_page(page, offset_in_page(pos), buf, n);
+ memcpy_to_folio(folio, offset_in_folio(folio, pos), buf, n);
- res = aops->write_end(NULL, mapping, pos, n, n, page, fsdata);
+ res = aops->write_end(NULL, mapping, pos, n, n, folio, fsdata);
if (res < 0)
return res;
if (res != n)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index f40785bc4e55..6ff94cdf1515 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -98,7 +98,7 @@ static const struct xattr_handler * const ext4_xattr_handler_map[] = {
[EXT4_XATTR_INDEX_HURD] = &ext4_xattr_hurd_handler,
};
-const struct xattr_handler *ext4_xattr_handlers[] = {
+const struct xattr_handler * const ext4_xattr_handlers[] = {
&ext4_xattr_user_handler,
&ext4_xattr_trusted_handler,
#ifdef CONFIG_EXT4_FS_SECURITY
@@ -356,7 +356,7 @@ ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode)
{
- return ((u64) inode_get_ctime(ea_inode).tv_sec << 32) |
+ return ((u64) inode_get_ctime_sec(ea_inode) << 32) |
(u32) inode_peek_iversion_raw(ea_inode);
}
@@ -368,12 +368,12 @@ static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count)
static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode)
{
- return (u32)ea_inode->i_atime.tv_sec;
+ return (u32) inode_get_atime_sec(ea_inode);
}
static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash)
{
- ea_inode->i_atime.tv_sec = hash;
+ inode_set_atime(ea_inode, hash, 0);
}
/*
@@ -418,7 +418,7 @@ free_bhs:
return ret;
}
-#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec)
+#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode_get_mtime_sec(inode)))
static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
u32 ea_inode_hash, struct inode **ea_inode)
@@ -1176,15 +1176,24 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
{
struct inode *ea_inode;
struct ext4_xattr_entry *entry;
+ struct ext4_iloc iloc;
bool dirty = false;
unsigned int ea_ino;
int err;
int credits;
+ void *end;
+
+ if (block_csum)
+ end = (void *)bh->b_data + bh->b_size;
+ else {
+ ext4_get_inode_loc(parent, &iloc);
+ end = (void *)ext4_raw_inode(&iloc) + EXT4_SB(parent->i_sb)->s_inode_size;
+ }
/* One credit for dec ref on ea_inode, one for orphan list addition, */
credits = 2 + extra_credits;
- for (entry = first; !IS_LAST_ENTRY(entry);
+ for (entry = first; (void *)entry < end && !IS_LAST_ENTRY(entry);
entry = EXT4_XATTR_NEXT(entry)) {
if (!entry->e_value_inum)
continue;
@@ -2034,8 +2043,13 @@ clone_block:
inserted:
if (!IS_LAST_ENTRY(s->first)) {
- new_bh = ext4_xattr_block_cache_find(inode, header(s->base),
- &ce);
+ new_bh = ext4_xattr_block_cache_find(inode, header(s->base), &ce);
+ if (IS_ERR(new_bh)) {
+ error = PTR_ERR(new_bh);
+ new_bh = NULL;
+ goto cleanup;
+ }
+
if (new_bh) {
/* We found an identical block in the cache. */
if (new_bh == bs->bh)
@@ -2875,33 +2889,31 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
if (*ea_inode_array == NULL) {
/*
* Start with 15 inodes, so it fits into a power-of-two size.
- * If *ea_inode_array is NULL, this is essentially offsetof()
*/
- (*ea_inode_array) =
- kmalloc(offsetof(struct ext4_xattr_inode_array,
- inodes[EIA_MASK]),
- GFP_NOFS);
+ (*ea_inode_array) = kmalloc(
+ struct_size(*ea_inode_array, inodes, EIA_MASK),
+ GFP_NOFS);
if (*ea_inode_array == NULL)
return -ENOMEM;
(*ea_inode_array)->count = 0;
} else if (((*ea_inode_array)->count & EIA_MASK) == EIA_MASK) {
/* expand the array once all 15 + n * 16 slots are full */
struct ext4_xattr_inode_array *new_array = NULL;
- int count = (*ea_inode_array)->count;
- /* if new_array is NULL, this is essentially offsetof() */
new_array = kmalloc(
- offsetof(struct ext4_xattr_inode_array,
- inodes[count + EIA_INCR]),
- GFP_NOFS);
+ struct_size(*ea_inode_array, inodes,
+ (*ea_inode_array)->count + EIA_INCR),
+ GFP_NOFS);
if (new_array == NULL)
return -ENOMEM;
memcpy(new_array, *ea_inode_array,
- offsetof(struct ext4_xattr_inode_array, inodes[count]));
+ struct_size(*ea_inode_array, inodes,
+ (*ea_inode_array)->count));
kfree(*ea_inode_array);
*ea_inode_array = new_array;
}
- (*ea_inode_array)->inodes[(*ea_inode_array)->count++] = inode;
+ (*ea_inode_array)->count++;
+ (*ea_inode_array)->inodes[(*ea_inode_array)->count - 1] = inode;
return 0;
}
@@ -3032,8 +3044,6 @@ void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
*
* Create a new entry in the extended attribute block cache, and insert
* it unless such an entry is already in the cache.
- *
- * Returns 0, or a negative error number on failure.
*/
static void
ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache,
@@ -3061,8 +3071,7 @@ ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache,
*
* Compare two extended attribute blocks for equality.
*
- * Returns 0 if the blocks are equal, 1 if they differ, and
- * a negative error number on errors.
+ * Returns 0 if the blocks are equal, 1 if they differ.
*/
static int
ext4_xattr_cmp(struct ext4_xattr_header *header1,
@@ -3101,8 +3110,8 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1,
*
* Find an identical extended attribute block.
*
- * Returns a pointer to the block found, or NULL if such a block was
- * not found or an error occurred.
+ * Returns a pointer to the block found, or NULL if such a block was not
+ * found, or an error pointer if an error occurred while reading ea block.
*/
static struct buffer_head *
ext4_xattr_block_cache_find(struct inode *inode,
@@ -3124,13 +3133,11 @@ ext4_xattr_block_cache_find(struct inode *inode,
bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO);
if (IS_ERR(bh)) {
- if (PTR_ERR(bh) == -ENOMEM) {
- mb_cache_entry_put(ea_block_cache, ce);
- return NULL;
- }
- bh = NULL;
- EXT4_ERROR_INODE(inode, "block %lu read error",
- (unsigned long)ce->e_value);
+ if (PTR_ERR(bh) != -ENOMEM)
+ EXT4_ERROR_INODE(inode, "block %lu read error",
+ (unsigned long)ce->e_value);
+ mb_cache_entry_put(ea_block_cache, ce);
+ return bh;
} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
*pce = ce;
return bh;
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 824faf0b15a8..b25c2d7b5f99 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -32,8 +32,7 @@ struct ext4_xattr_header {
__le32 h_refcount; /* reference count */
__le32 h_blocks; /* number of disk blocks used */
__le32 h_hash; /* hash value of all attributes */
- __le32 h_checksum; /* crc32c(uuid+id+xattrblock) */
- /* id = inum if refcount=1, blknum otherwise */
+ __le32 h_checksum; /* crc32c(uuid+blknum+xattrblock) */
__u32 h_reserved[3]; /* zero right now */
};
@@ -130,8 +129,8 @@ struct ext4_xattr_ibody_find {
};
struct ext4_xattr_inode_array {
- unsigned int count; /* # of used items in the array */
- struct inode *inodes[];
+ unsigned int count;
+ struct inode *inodes[] __counted_by(count);
};
extern const struct xattr_handler ext4_xattr_user_handler;
@@ -193,7 +192,7 @@ extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
struct ext4_inode *raw_inode, handle_t *handle);
extern void ext4_evict_ea_inode(struct inode *inode);
-extern const struct xattr_handler *ext4_xattr_handlers[];
+extern const struct xattr_handler * const ext4_xattr_handlers[];
extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
struct ext4_xattr_ibody_find *is);