diff options
Diffstat (limited to 'fs/ext4')
-rw-r--r-- | fs/ext4/Kconfig | 1 | ||||
-rw-r--r-- | fs/ext4/acl.c | 2 | ||||
-rw-r--r-- | fs/ext4/ext4.h | 92 | ||||
-rw-r--r-- | fs/ext4/ext4_jbd2.c | 3 | ||||
-rw-r--r-- | fs/ext4/extents.c | 12 | ||||
-rw-r--r-- | fs/ext4/file.c | 11 | ||||
-rw-r--r-- | fs/ext4/ialloc.c | 2 | ||||
-rw-r--r-- | fs/ext4/inline.c | 4 | ||||
-rw-r--r-- | fs/ext4/inode-test.c | 6 | ||||
-rw-r--r-- | fs/ext4/inode.c | 24 | ||||
-rw-r--r-- | fs/ext4/ioctl.c | 9 | ||||
-rw-r--r-- | fs/ext4/mballoc.c | 172 | ||||
-rw-r--r-- | fs/ext4/move_extent.c | 19 | ||||
-rw-r--r-- | fs/ext4/namei.c | 26 | ||||
-rw-r--r-- | fs/ext4/super.c | 73 | ||||
-rw-r--r-- | fs/ext4/xattr.c | 20 |
16 files changed, 292 insertions, 184 deletions
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 86699c8cab28..e20d59221fc0 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -28,6 +28,7 @@ config EXT3_FS_SECURITY config EXT4_FS tristate "The Extended 4 (ext4) filesystem" + select BUFFER_HEAD select JBD2 select CRC16 select CRYPTO diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 27fcbddfb148..3bffe862f954 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -259,7 +259,7 @@ retry: error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */); if (!error && update_mode) { inode->i_mode = mode; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); error = ext4_mark_inode_dirty(handle, inode); } out_stop: diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0a2d55faa095..481491e892df 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -868,64 +868,70 @@ struct ext4_inode { * affected filesystem before 2242. */ -static inline __le32 ext4_encode_extra_time(struct timespec64 *time) +static inline __le32 ext4_encode_extra_time(struct timespec64 ts) { - u32 extra =((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK; - return cpu_to_le32(extra | (time->tv_nsec << EXT4_EPOCH_BITS)); + u32 extra = ((ts.tv_sec - (s32)ts.tv_sec) >> 32) & EXT4_EPOCH_MASK; + return cpu_to_le32(extra | (ts.tv_nsec << EXT4_EPOCH_BITS)); } -static inline void ext4_decode_extra_time(struct timespec64 *time, - __le32 extra) +static inline struct timespec64 ext4_decode_extra_time(__le32 base, + __le32 extra) { + struct timespec64 ts = { .tv_sec = (signed)le32_to_cpu(base) }; + if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) - time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; - time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; + ts.tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; + ts.tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; + return ts; } -#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ +#define EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, ts) \ do { \ - if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\ - (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ - (raw_inode)->xtime ## _extra = \ - ext4_encode_extra_time(&(inode)->xtime); \ - } \ - else \ - (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (inode)->xtime.tv_sec, S32_MIN, S32_MAX)); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ + (raw_inode)->xtime = cpu_to_le32((ts).tv_sec); \ + (raw_inode)->xtime ## _extra = ext4_encode_extra_time(ts); \ + } else \ + (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (ts).tv_sec, S32_MIN, S32_MAX)); \ } while (0) -#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ -do { \ - if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ - (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \ - if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ - (raw_inode)->xtime ## _extra = \ - ext4_encode_extra_time(&(einode)->xtime); \ -} while (0) +#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ + EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, (inode)->xtime) + +#define EXT4_INODE_SET_CTIME(inode, raw_inode) \ + EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode)) + +#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + EXT4_INODE_SET_XTIME_VAL(xtime, &((einode)->vfs_inode), \ + raw_inode, (einode)->xtime) + +#define EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode) \ + (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra) ? \ + ext4_decode_extra_time((raw_inode)->xtime, \ + (raw_inode)->xtime ## _extra) : \ + (struct timespec64) { \ + .tv_sec = (signed)le32_to_cpu((raw_inode)->xtime) \ + }) #define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ do { \ - (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ - if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ - ext4_decode_extra_time(&(inode)->xtime, \ - raw_inode->xtime ## _extra); \ - } \ - else \ - (inode)->xtime.tv_nsec = 0; \ + (inode)->xtime = EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode); \ } while (0) +#define EXT4_INODE_GET_CTIME(inode, raw_inode) \ +do { \ + inode_set_ctime_to_ts(inode, \ + EXT4_INODE_GET_XTIME_VAL(i_ctime, inode, raw_inode)); \ +} while (0) -#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ -do { \ - if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ - (einode)->xtime.tv_sec = \ - (signed)le32_to_cpu((raw_inode)->xtime); \ - else \ - (einode)->xtime.tv_sec = 0; \ - if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ - ext4_decode_extra_time(&(einode)->xtime, \ - raw_inode->xtime ## _extra); \ - else \ - (einode)->xtime.tv_nsec = 0; \ +#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (einode)->xtime = \ + EXT4_INODE_GET_XTIME_VAL(xtime, &(einode->vfs_inode), \ + raw_inode); \ + else \ + (einode)->xtime = (struct timespec64){0, 0}; \ } while (0) #define i_disk_version osd1.linux1.l_i_version @@ -3774,8 +3780,6 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); } -#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) - /* For ioend & aio unwritten conversion wait queues */ #define EXT4_WQ_HASH_SZ 37 #define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 77f318ec8abb..b38d59581411 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -234,8 +234,7 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, might_sleep(); - if (bh->b_bdev->bd_super) - ext4_check_bdev_write_error(bh->b_bdev->bd_super); + ext4_check_bdev_write_error(sb); if (ext4_handle_valid(handle)) { err = jbd2_journal_get_write_access(handle, bh); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e4115d338f10..202c76996b62 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4476,12 +4476,12 @@ retry: map.m_lblk += ret; map.m_len = len = len - ret; epos = (loff_t)map.m_lblk << inode->i_blkbits; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); if (new_size) { if (epos > new_size) epos = new_size; if (ext4_update_inode_size(inode, epos) & 0x1) - inode->i_mtime = inode->i_ctime; + inode->i_mtime = inode_get_ctime(inode); } ret2 = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); @@ -4617,7 +4617,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, /* Now release the pages and zero block aligned part of pages */ truncate_pagecache_range(inode, start, end - 1); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); @@ -4642,7 +4642,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, goto out_mutex; } - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); if (new_size) ext4_update_inode_size(inode, new_size); ret = ext4_mark_inode_dirty(handle, inode); @@ -5378,7 +5378,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); ret = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); @@ -5488,7 +5488,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; EXT4_I(inode)->i_disksize += len; - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); ret = ext4_mark_inode_dirty(handle, inode); if (ret) goto out_stop; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index c457c8517f0f..2dc3f8301225 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -723,8 +723,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } #ifdef CONFIG_FS_DAX -static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, - enum page_entry_size pe_size) +static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order) { int error = 0; vm_fault_t result; @@ -740,7 +739,7 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, * read-only. * * We check for VM_SHARED rather than vmf->cow_page since the latter is - * unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for + * unset for order != 0 (i.e. only in do_cow_fault); for * other sizes, dax_iomap_fault will handle splitting / fallback so that * we eventually come back with a COW page. */ @@ -764,7 +763,7 @@ retry: } else { filemap_invalidate_lock_shared(mapping); } - result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops); + result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops); if (write) { ext4_journal_stop(handle); @@ -773,7 +772,7 @@ retry: goto retry; /* Handling synchronous page fault? */ if (result & VM_FAULT_NEEDDSYNC) - result = dax_finish_sync_fault(vmf, pe_size, pfn); + result = dax_finish_sync_fault(vmf, order, pfn); filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(sb); } else { @@ -785,7 +784,7 @@ retry: static vm_fault_t ext4_dax_fault(struct vm_fault *vmf) { - return ext4_dax_huge_fault(vmf, PE_SIZE_PTE); + return ext4_dax_huge_fault(vmf, 0); } static const struct vm_operations_struct ext4_dax_vm_ops = { diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 754f961cd9fd..48abef5f23e7 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1250,7 +1250,7 @@ got: inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); ei->i_crtime = inode->i_mtime; memset(ei->i_data, 0, sizeof(ei->i_data)); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index a4b7e4bc32d4..003861037374 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1037,7 +1037,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, * happen is that the times are slightly out of date * and/or different from the directory change time. */ - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); ext4_update_dx_flag(dir); inode_inc_iversion(dir); return 1; @@ -1991,7 +1991,7 @@ out: ext4_orphan_del(handle, inode); if (err == 0) { - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); err = ext4_mark_inode_dirty(handle, inode); if (IS_SYNC(inode)) ext4_handle_sync(handle); diff --git a/fs/ext4/inode-test.c b/fs/ext4/inode-test.c index 7935ea6cf92c..f0c0fd507fbc 100644 --- a/fs/ext4/inode-test.c +++ b/fs/ext4/inode-test.c @@ -245,9 +245,9 @@ static void inode_test_xtimestamp_decoding(struct kunit *test) struct timestamp_expectation *test_param = (struct timestamp_expectation *)(test->param_value); - timestamp.tv_sec = get_32bit_time(test_param); - ext4_decode_extra_time(×tamp, - cpu_to_le32(test_param->extra_bits)); + timestamp = ext4_decode_extra_time( + cpu_to_le32(get_32bit_time(test_param)), + cpu_to_le32(test_param->extra_bits)); KUNIT_EXPECT_EQ_MSG(test, test_param->expected.tv_sec, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 43775a6ca505..89737d5a1614 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1569,7 +1569,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, if (folio->index < mpd->first_page) continue; - if (folio->index + folio_nr_pages(folio) - 1 > end) + if (folio_next_index(folio) - 1 > end) continue; BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); @@ -2455,7 +2455,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) if (mpd->map.m_len == 0) mpd->first_page = folio->index; - mpd->next_page = folio->index + folio_nr_pages(folio); + mpd->next_page = folio_next_index(folio); /* * Writeout when we cannot modify metadata is simple. * Just submit the page. For data=journal mode we @@ -3986,7 +3986,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) if (IS_SYNC(inode)) ext4_handle_sync(handle); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); ret2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret2)) ret = ret2; @@ -4146,7 +4146,7 @@ out_stop: if (inode->i_nlink) ext4_orphan_del(handle, inode); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); err2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(err2 && !err)) err = err2; @@ -4249,7 +4249,7 @@ static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode } raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); - EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); + EXT4_INODE_SET_CTIME(inode, raw_inode); EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); @@ -4858,7 +4858,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, } } - EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); + EXT4_INODE_GET_CTIME(inode, raw_inode); EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); @@ -4981,7 +4981,7 @@ static void __ext4_update_other_inode_time(struct super_block *sb, spin_unlock(&inode->i_lock); spin_lock(&ei->i_raw_lock); - EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); + EXT4_INODE_SET_CTIME(inode, raw_inode); EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); ext4_inode_csum_set(inode, raw_inode, ei); @@ -5376,10 +5376,8 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, * Update c/mtime on truncate up, ext4_truncate() will * update c/mtime in shrink case below */ - if (!shrink) { - inode->i_mtime = current_time(inode); - inode->i_ctime = inode->i_mtime; - } + if (!shrink) + inode->i_mtime = inode_set_ctime_current(inode); if (shrink) ext4_fc_track_range(handle, inode, @@ -5537,7 +5535,7 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, STATX_ATTR_NODUMP | STATX_ATTR_VERITY); - generic_fillattr(idmap, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); return 0; } @@ -6140,7 +6138,7 @@ retry_alloc: if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_alloc; out_ret: - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); out: filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(inode->i_sb); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 331859511f80..b0349f451863 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -449,7 +449,8 @@ static long swap_inode_boot_loader(struct super_block *sb, diff = size - size_bl; swap_inode_data(inode, inode_bl); - inode->i_ctime = inode_bl->i_ctime = current_time(inode); + inode_set_ctime_current(inode); + inode_set_ctime_current(inode_bl); inode_inc_iversion(inode); inode->i_generation = get_random_u32(); @@ -663,7 +664,7 @@ static int ext4_ioctl_setflags(struct inode *inode, ext4_set_inode_flags(inode, false); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_iversion(inode); err = ext4_mark_iloc_dirty(handle, inode, &iloc); @@ -774,7 +775,7 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) } EXT4_I(inode)->i_projid = kprojid; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_iversion(inode); out_dirty: rc = ext4_mark_iloc_dirty(handle, inode, &iloc); @@ -1266,7 +1267,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } err = ext4_reserve_inode_write(handle, inode, &iloc); if (err == 0) { - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_iversion(inode); inode->i_generation = generation; err = ext4_mark_iloc_dirty(handle, inode, &iloc); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a2475b8c9fb5..21b903fe546e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1006,14 +1006,11 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context * fls() instead since we need to know the actual length while modifying * goal length. */ - order = fls(ac->ac_g_ex.fe_len); + order = fls(ac->ac_g_ex.fe_len) - 1; min_order = order - sbi->s_mb_best_avail_max_trim_order; if (min_order < 0) min_order = 0; - if (1 << min_order < ac->ac_o_ex.fe_len) - min_order = fls(ac->ac_o_ex.fe_len) + 1; - if (sbi->s_stripe > 0) { /* * We are assuming that stripe size is always a multiple of @@ -1021,9 +1018,16 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context */ num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe); if (1 << min_order < num_stripe_clusters) - min_order = fls(num_stripe_clusters); + /* + * We consider 1 order less because later we round + * up the goal len to num_stripe_clusters + */ + min_order = fls(num_stripe_clusters) - 1; } + if (1 << min_order < ac->ac_o_ex.fe_len) + min_order = fls(ac->ac_o_ex.fe_len); + for (i = order; i >= min_order; i--) { int frag_order; /* @@ -4761,8 +4765,8 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) int order, i; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; - struct ext4_prealloc_space *tmp_pa, *cpa = NULL; - ext4_lblk_t tmp_pa_start, tmp_pa_end; + struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL; + loff_t tmp_pa_end; struct rb_node *iter; ext4_fsblk_t goal_block; @@ -4770,47 +4774,151 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return false; - /* first, try per-file preallocation */ + /* + * first, try per-file preallocation by searching the inode pa rbtree. + * + * Here, we can't do a direct traversal of the tree because + * ext4_mb_discard_group_preallocation() can paralelly mark the pa + * deleted and that can cause direct traversal to skip some entries. + */ read_lock(&ei->i_prealloc_lock); + + if (RB_EMPTY_ROOT(&ei->i_prealloc_node)) { + goto try_group_pa; + } + + /* + * Step 1: Find a pa with logical start immediately adjacent to the + * original logical start. This could be on the left or right. + * + * (tmp_pa->pa_lstart never changes so we can skip locking for it). + */ for (iter = ei->i_prealloc_node.rb_node; iter; iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, - tmp_pa_start, iter)) { + tmp_pa->pa_lstart, iter)) { tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); + } - /* all fields in this condition don't change, - * so we can skip locking for them */ - tmp_pa_start = tmp_pa->pa_lstart; - tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); - - /* original request start doesn't lie in this PA */ - if (ac->ac_o_ex.fe_logical < tmp_pa_start || - ac->ac_o_ex.fe_logical >= tmp_pa_end) - continue; + /* + * Step 2: The adjacent pa might be to the right of logical start, find + * the left adjacent pa. After this step we'd have a valid tmp_pa whose + * logical start is towards the left of original request's logical start + */ + if (tmp_pa->pa_lstart > ac->ac_o_ex.fe_logical) { + struct rb_node *tmp; + tmp = rb_prev(&tmp_pa->pa_node.inode_node); - /* non-extent files can't have physical blocks past 2^32 */ - if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && - (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) > - EXT4_MAX_BLOCK_FILE_PHYS)) { + if (tmp) { + tmp_pa = rb_entry(tmp, struct ext4_prealloc_space, + pa_node.inode_node); + } else { /* - * Since PAs don't overlap, we won't find any - * other PA to satisfy this. + * If there is no adjacent pa to the left then finding + * an overlapping pa is not possible hence stop searching + * inode pa tree */ - break; + goto try_group_pa; } + } + + BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical)); - /* found preallocated blocks, use them */ + /* + * Step 3: If the left adjacent pa is deleted, keep moving left to find + * the first non deleted adjacent pa. After this step we should have a + * valid tmp_pa which is guaranteed to be non deleted. + */ + for (iter = &tmp_pa->pa_node.inode_node;; iter = rb_prev(iter)) { + if (!iter) { + /* + * no non deleted left adjacent pa, so stop searching + * inode pa tree + */ + goto try_group_pa; + } + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); spin_lock(&tmp_pa->pa_lock); - if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free && - likely(ext4_mb_pa_goal_check(ac, tmp_pa))) { - atomic_inc(&tmp_pa->pa_count); - ext4_mb_use_inode_pa(ac, tmp_pa); + if (tmp_pa->pa_deleted == 0) { + /* + * We will keep holding the pa_lock from + * this point on because we don't want group discard + * to delete this pa underneath us. Since group + * discard is anyways an ENOSPC operation it + * should be okay for it to wait a few more cycles. + */ + break; + } else { spin_unlock(&tmp_pa->pa_lock); - read_unlock(&ei->i_prealloc_lock); - return true; } + } + + BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical)); + BUG_ON(tmp_pa->pa_deleted == 1); + + /* + * Step 4: We now have the non deleted left adjacent pa. Only this + * pa can possibly satisfy the request hence check if it overlaps + * original logical start and stop searching if it doesn't. + */ + tmp_pa_end = (loff_t)tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + + if (ac->ac_o_ex.fe_logical >= tmp_pa_end) { spin_unlock(&tmp_pa->pa_lock); + goto try_group_pa; + } + + /* non-extent files can't have physical blocks past 2^32 */ + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && + (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) > + EXT4_MAX_BLOCK_FILE_PHYS)) { + /* + * Since PAs don't overlap, we won't find any other PA to + * satisfy this. + */ + spin_unlock(&tmp_pa->pa_lock); + goto try_group_pa; + } + + if (tmp_pa->pa_free && likely(ext4_mb_pa_goal_check(ac, tmp_pa))) { + atomic_inc(&tmp_pa->pa_count); + ext4_mb_use_inode_pa(ac, tmp_pa); + spin_unlock(&tmp_pa->pa_lock); + read_unlock(&ei->i_prealloc_lock); + return true; + } else { + /* + * We found a valid overlapping pa but couldn't use it because + * it had no free blocks. This should ideally never happen + * because: + * + * 1. When a new inode pa is added to rbtree it must have + * pa_free > 0 since otherwise we won't actually need + * preallocation. + * + * 2. An inode pa that is in the rbtree can only have it's + * pa_free become zero when another thread calls: + * ext4_mb_new_blocks + * ext4_mb_use_preallocated + * ext4_mb_use_inode_pa + * + * 3. Further, after the above calls make pa_free == 0, we will + * immediately remove it from the rbtree in: + * ext4_mb_new_blocks + * ext4_mb_release_context + * ext4_mb_put_pa + * + * 4. Since the pa_free becoming 0 and pa_free getting removed + * from tree both happen in ext4_mb_new_blocks, which is always + * called with i_data_sem held for data allocations, we can be + * sure that another process will never see a pa in rbtree with + * pa_free == 0. + */ + WARN_ON_ONCE(tmp_pa->pa_free == 0); } + spin_unlock(&tmp_pa->pa_lock); +try_group_pa: read_unlock(&ei->i_prealloc_lock); /* can we use group allocation? */ diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index b5af2fc03b2f..18a9e7c47975 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -340,10 +340,8 @@ again: ext4_double_up_write_data_sem(orig_inode, donor_inode); goto data_copy; } - if ((folio_has_private(folio[0]) && - !filemap_release_folio(folio[0], 0)) || - (folio_has_private(folio[1]) && - !filemap_release_folio(folio[1], 0))) { + if (!filemap_release_folio(folio[0], 0) || + !filemap_release_folio(folio[1], 0)) { *err = -EBUSY; goto drop_data_sem; } @@ -362,10 +360,8 @@ data_copy: /* At this point all buffers in range are uptodate, old mapping layout * is no longer required, try to drop it now. */ - if ((folio_has_private(folio[0]) && - !filemap_release_folio(folio[0], 0)) || - (folio_has_private(folio[1]) && - !filemap_release_folio(folio[1], 0))) { + if (!filemap_release_folio(folio[0], 0) || + !filemap_release_folio(folio[1], 0)) { *err = -EBUSY; goto unlock_folios; } @@ -392,14 +388,11 @@ data_copy: for (i = 0; i < block_len_in_page; i++) { *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0); if (*err < 0) - break; + goto repair_branches; bh = bh->b_this_page; } - if (!*err) - *err = block_commit_write(&folio[0]->page, from, from + replaced_size); - if (unlikely(*err < 0)) - goto repair_branches; + block_commit_write(&folio[0]->page, from, from + replaced_size); /* Even in case of data=writeback it is reasonable to pin * inode to transaction, to prevent unexpected data loss */ diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 0caf6c730ce3..933ad03f4f58 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2203,7 +2203,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, * happen is that the times are slightly out of date * and/or different from the directory change time. */ - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); ext4_update_dx_flag(dir); inode_inc_iversion(dir); err2 = ext4_mark_inode_dirty(handle, dir); @@ -3197,7 +3197,8 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) * recovery. */ inode->i_size = 0; ext4_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); + dir->i_mtime = inode_set_ctime_current(dir); + inode_set_ctime_current(inode); retval = ext4_mark_inode_dirty(handle, inode); if (retval) goto end_rmdir; @@ -3271,7 +3272,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name, retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto out_handle; - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); ext4_update_dx_flag(dir); retval = ext4_mark_inode_dirty(handle, dir); if (retval) @@ -3286,7 +3287,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name, drop_nlink(inode); if (!inode->i_nlink) ext4_orphan_add(handle, inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); retval = ext4_mark_inode_dirty(handle, inode); if (dentry && !retval) ext4_fc_track_unlink(handle, dentry); @@ -3463,7 +3464,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); ext4_inc_count(inode); ihold(inode); @@ -3641,8 +3642,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent, if (ext4_has_feature_filetype(ent->dir->i_sb)) ent->de->file_type = file_type; inode_inc_iversion(ent->dir); - ent->dir->i_ctime = ent->dir->i_mtime = - current_time(ent->dir); + ent->dir->i_mtime = inode_set_ctime_current(ent->dir); retval = ext4_mark_inode_dirty(handle, ent->dir); BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); if (!ent->inlined) { @@ -3941,7 +3941,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old.inode->i_ctime = current_time(old.inode); + inode_set_ctime_current(old.inode); retval = ext4_mark_inode_dirty(handle, old.inode); if (unlikely(retval)) goto end_rename; @@ -3955,9 +3955,9 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (new.inode) { ext4_dec_count(new.inode); - new.inode->i_ctime = current_time(new.inode); + inode_set_ctime_current(new.inode); } - old.dir->i_ctime = old.dir->i_mtime = current_time(old.dir); + old.dir->i_mtime = inode_set_ctime_current(old.dir); ext4_update_dx_flag(old.dir); if (old.dir_bh) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); @@ -4053,7 +4053,6 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, }; u8 new_file_type; int retval; - struct timespec64 ctime; if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) && !projid_eq(EXT4_I(new_dir)->i_projid, @@ -4147,9 +4146,8 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - ctime = current_time(old.inode); - old.inode->i_ctime = ctime; - new.inode->i_ctime = ctime; + inode_set_ctime_current(old.inode); + inode_set_ctime_current(new.inode); retval = ext4_mark_inode_dirty(handle, old.inode); if (unlikely(retval)) goto end_rename; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c94ebf704616..73547d2334fd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -93,6 +93,7 @@ static int ext4_get_tree(struct fs_context *fc); static int ext4_reconfigure(struct fs_context *fc); static void ext4_fc_free(struct fs_context *fc); static int ext4_init_fs_context(struct fs_context *fc); +static void ext4_kill_sb(struct super_block *sb); static const struct fs_parameter_spec ext4_param_specs[]; /* @@ -135,12 +136,12 @@ static struct file_system_type ext2_fs_type = { .name = "ext2", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, - .kill_sb = kill_block_super, + .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext2"); MODULE_ALIAS("ext2"); -#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type) +#define IS_EXT2_SB(sb) ((sb)->s_type == &ext2_fs_type) #else #define IS_EXT2_SB(sb) (0) #endif @@ -151,12 +152,12 @@ static struct file_system_type ext3_fs_type = { .name = "ext3", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, - .kill_sb = kill_block_super, + .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext3"); MODULE_ALIAS("ext3"); -#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) +#define IS_EXT3_SB(sb) ((sb)->s_type == &ext3_fs_type) static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, @@ -1096,15 +1097,6 @@ void ext4_update_dynamic_rev(struct super_block *sb) */ } -static void ext4_bdev_mark_dead(struct block_device *bdev) -{ - ext4_force_shutdown(bdev->bd_holder, EXT4_GOING_FLAGS_NOLOGFLUSH); -} - -static const struct blk_holder_ops ext4_holder_ops = { - .mark_dead = ext4_bdev_mark_dead, -}; - /* * Open the external journal device */ @@ -1113,7 +1105,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) struct block_device *bdev; bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb, - &ext4_holder_ops); + &fs_holder_ops); if (IS_ERR(bdev)) goto fail; return bdev; @@ -1125,25 +1117,6 @@ fail: return NULL; } -/* - * Release the journal device - */ -static void ext4_blkdev_remove(struct ext4_sb_info *sbi) -{ - struct block_device *bdev; - bdev = sbi->s_journal_bdev; - if (bdev) { - /* - * Invalidate the journal device's buffers. We don't want them - * floating about in memory - the physical journal device may - * hotswapped, and it breaks the `ro-after' testing code. - */ - invalidate_bdev(bdev); - blkdev_put(bdev, sbi->s_sb); - sbi->s_journal_bdev = NULL; - } -} - static inline struct inode *orphan_list_entry(struct list_head *l) { return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode; @@ -1339,8 +1312,13 @@ static void ext4_put_super(struct super_block *sb) sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); if (sbi->s_journal_bdev) { + /* + * Invalidate the journal device's buffers. We don't want them + * floating about in memory - the physical journal device may + * hotswapped, and it breaks the `ro-after' testing code. + */ sync_blockdev(sbi->s_journal_bdev); - ext4_blkdev_remove(sbi); + invalidate_bdev(sbi->s_journal_bdev); } ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); @@ -5572,7 +5550,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) spin_lock_init(&sbi->s_bdev_wb_lock); errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err, &sbi->s_bdev_wb_err); - sb->s_bdev->bd_super = sb; EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; @@ -5664,9 +5641,11 @@ failed_mount: kfree(get_qf_name(sb, sbi, i)); #endif fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); - /* ext4_blkdev_remove() calls kill_bdev(), release bh before it. */ brelse(sbi->s_sbh); - ext4_blkdev_remove(sbi); + if (sbi->s_journal_bdev) { + invalidate_bdev(sbi->s_journal_bdev); + blkdev_put(sbi->s_journal_bdev, sb); + } out_fail: invalidate_bdev(sb->s_bdev); sb->s_fs_info = NULL; @@ -5854,7 +5833,10 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) return NULL; + /* see get_tree_bdev why this is needed and safe */ + up_write(&sb->s_umount); bdev = ext4_blkdev_get(j_dev, sb); + down_write(&sb->s_umount); if (bdev == NULL) return NULL; @@ -7103,7 +7085,7 @@ static int ext4_quota_off(struct super_block *sb, int type) } EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL); inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); out_unlock: @@ -7273,13 +7255,24 @@ static inline int ext3_feature_set_ok(struct super_block *sb) return 1; } +static void ext4_kill_sb(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct block_device *journal_bdev = sbi ? sbi->s_journal_bdev : NULL; + + kill_block_super(sb); + + if (journal_bdev) + blkdev_put(journal_bdev, sb); +} + static struct file_system_type ext4_fs_type = { .owner = THIS_MODULE, .name = "ext4", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, + .kill_sb = ext4_kill_sb, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, }; MODULE_ALIAS_FS("ext4"); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 321e3a888c20..281e1bfbbe3e 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -356,13 +356,13 @@ ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size) static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode) { - return ((u64)ea_inode->i_ctime.tv_sec << 32) | + return ((u64) inode_get_ctime(ea_inode).tv_sec << 32) | (u32) inode_peek_iversion_raw(ea_inode); } static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count) { - ea_inode->i_ctime.tv_sec = (u32)(ref_count >> 32); + inode_set_ctime(ea_inode, (u32)(ref_count >> 32), 0); inode_set_iversion_raw(ea_inode, ref_count & 0xffffffff); } @@ -1782,6 +1782,20 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, memmove(here, (void *)here + size, (void *)last - (void *)here + sizeof(__u32)); memset(last, 0, size); + + /* + * Update i_inline_off - moved ibody region might contain + * system.data attribute. Handling a failure here won't + * cause other complications for setting an xattr. + */ + if (!is_block && ext4_has_inline_data(inode)) { + ret = ext4_find_inline_data_nolock(inode); + if (ret) { + ext4_warning_inode(inode, + "unable to update i_inline_off"); + goto out; + } + } } else if (s->not_found) { /* Insert new name. */ size_t size = EXT4_XATTR_LEN(name_len); @@ -2459,7 +2473,7 @@ retry_inode: } if (!error) { ext4_xattr_update_super_block(handle, inode->i_sb); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_iversion(inode); if (!value) no_expand = 0; |