summaryrefslogtreecommitdiff
path: root/fs/ext4
AgeCommit message (Collapse)AuthorFilesLines
2023-01-07ext4: allocate extended attribute value in vmalloc areaYe Bin1-2/+2
commit cc12a6f25e07ed05d5825a1664b67a970842b2ca upstream. Now, extended attribute value maximum length is 64K. The memory requested here does not need continuous physical addresses, so it is appropriate to use kvmalloc to request memory. At the same time, it can also cope with the situation that the extended attribute will become longer in the future. Signed-off-by: Ye Bin <yebin10@huawei.com> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20221208023233.1231330-3-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: avoid unaccounted block allocation when expanding inodeJan Kara1-0/+8
commit 8994d11395f8165b3deca1971946f549f0822630 upstream. When expanding inode space in ext4_expand_extra_isize_ea() we may need to allocate external xattr block. If quota is not initialized for the inode, the block allocation will not be accounted into quota usage. Make sure the quota is initialized before we try to expand inode space. Reported-by: Pengfei Xu <pengfei.xu@intel.com> Link: https://lore.kernel.org/all/Y5BT+k6xWqthZc1P@xpf.sh.intel.com Signed-off-by: Jan Kara <jack@suse.cz> Cc: stable@kernel.org Link: https://lore.kernel.org/r/20221207115937.26601-2-jack@suse.cz Signed-off-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: initialize quota before expanding inode in setproject ioctlJan Kara1-4/+4
commit 1485f726c6dec1a1f85438f2962feaa3d585526f upstream. Make sure we initialize quotas before possibly expanding inode space (and thus maybe needing to allocate external xattr block) in ext4_ioctl_setproject(). This prevents not accounting the necessary block allocation. Signed-off-by: Jan Kara <jack@suse.cz> Cc: stable@kernel.org Link: https://lore.kernel.org/r/20221207115937.26601-1-jack@suse.cz Signed-off-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: fix inode leak in ext4_xattr_inode_create() on an error pathYe Bin1-0/+3
commit e4db04f7d3dbbe16680e0ded27ea2a65b10f766a upstream. There is issue as follows when do setxattr with inject fault: [localhost]# fsck.ext4 -fn /dev/sda e2fsck 1.46.6-rc1 (12-Sep-2022) Pass 1: Checking inodes, blocks, and sizes Pass 2: Checking directory structure Pass 3: Checking directory connectivity Pass 4: Checking reference counts Unattached zero-length inode 15. Clear? no Unattached inode 15 Connect to /lost+found? no Pass 5: Checking group summary information /dev/sda: ********** WARNING: Filesystem still has errors ********** /dev/sda: 15/655360 files (0.0% non-contiguous), 66755/2621440 blocks This occurs in 'ext4_xattr_inode_create()'. If 'ext4_mark_inode_dirty()' fails, dropping i_nlink of the inode is needed. Or will lead to inode leak. Signed-off-by: Ye Bin <yebin10@huawei.com> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20221208023233.1231330-5-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: fix kernel BUG in 'ext4_write_inline_data_end()'Ye Bin1-1/+2
commit 5c099c4fdc438014d5893629e70a8ba934433ee8 upstream. Syzbot report follow issue: ------------[ cut here ]------------ kernel BUG at fs/ext4/inline.c:227! invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 3629 Comm: syz-executor212 Not tainted 6.1.0-rc5-syzkaller-00018-g59d0d52c30d4 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022 RIP: 0010:ext4_write_inline_data+0x344/0x3e0 fs/ext4/inline.c:227 RSP: 0018:ffffc90003b3f368 EFLAGS: 00010293 RAX: 0000000000000000 RBX: ffff8880704e16c0 RCX: 0000000000000000 RDX: ffff888021763a80 RSI: ffffffff821e31a4 RDI: 0000000000000006 RBP: 000000000006818e R08: 0000000000000006 R09: 0000000000068199 R10: 0000000000000079 R11: 0000000000000000 R12: 000000000000000b R13: 0000000000068199 R14: ffffc90003b3f408 R15: ffff8880704e1c82 FS: 000055555723e3c0(0000) GS:ffff8880b9b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fffe8ac9080 CR3: 0000000079f81000 CR4: 0000000000350ee0 Call Trace: <TASK> ext4_write_inline_data_end+0x2a3/0x12f0 fs/ext4/inline.c:768 ext4_write_end+0x242/0xdd0 fs/ext4/inode.c:1313 ext4_da_write_end+0x3ed/0xa30 fs/ext4/inode.c:3063 generic_perform_write+0x316/0x570 mm/filemap.c:3764 ext4_buffered_write_iter+0x15b/0x460 fs/ext4/file.c:285 ext4_file_write_iter+0x8bc/0x16e0 fs/ext4/file.c:700 call_write_iter include/linux/fs.h:2191 [inline] do_iter_readv_writev+0x20b/0x3b0 fs/read_write.c:735 do_iter_write+0x182/0x700 fs/read_write.c:861 vfs_iter_write+0x74/0xa0 fs/read_write.c:902 iter_file_splice_write+0x745/0xc90 fs/splice.c:686 do_splice_from fs/splice.c:764 [inline] direct_splice_actor+0x114/0x180 fs/splice.c:931 splice_direct_to_actor+0x335/0x8a0 fs/splice.c:886 do_splice_direct+0x1ab/0x280 fs/splice.c:974 do_sendfile+0xb19/0x1270 fs/read_write.c:1255 __do_sys_sendfile64 fs/read_write.c:1323 [inline] __se_sys_sendfile64 fs/read_write.c:1309 [inline] __x64_sys_sendfile64+0x1d0/0x210 fs/read_write.c:1309 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd ---[ end trace 0000000000000000 ]--- Above issue may happens as follows: ext4_da_write_begin ext4_da_write_inline_data_begin ext4_da_convert_inline_data_to_extent ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ext4_da_write_end ext4_run_li_request ext4_mb_prefetch ext4_read_block_bitmap_nowait ext4_validate_block_bitmap ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT) percpu_counter_sub(&sbi->s_freeclusters_counter,grp->bb_free); -> sbi->s_freeclusters_counter become zero ext4_da_write_begin if (ext4_nonda_switch(inode->i_sb)) -> As freeclusters_counter is zero will return true *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; ext4_write_begin ext4_da_write_end if (write_mode == FALL_BACK_TO_NONDELALLOC) ext4_write_end if (inline_data) ext4_write_inline_data_end ext4_write_inline_data BUG_ON(pos + len > EXT4_I(inode)->i_inline_size); -> As inode is already convert to extent, so 'pos + len' > inline_size -> then trigger BUG. To solve this issue, instead of checking ext4_has_inline_data() which is only cleared after data has been written back, check the EXT4_STATE_MAY_INLINE_DATA flag in ext4_write_end(). Fixes: f19d5870cbf7 ("ext4: add normal write support for inline data") Reported-by: syzbot+4faa160fa96bfba639f8@syzkaller.appspotmail.com Reported-by: Jun Nie <jun.nie@linaro.org> Signed-off-by: Ye Bin <yebin10@huawei.com> Link: https://lore.kernel.org/r/20221206144134.1919987-1-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: fix deadlock due to mbcache entry corruptionJan Kara1-2/+2
commit a44e84a9b7764c72896f7241a0ec9ac7e7ef38dd upstream. When manipulating xattr blocks, we can deadlock infinitely looping inside ext4_xattr_block_set() where we constantly keep finding xattr block for reuse in mbcache but we are unable to reuse it because its reference count is too big. This happens because cache entry for the xattr block is marked as reusable (e_reusable set) although its reference count is too big. When this inconsistency happens, this inconsistent state is kept indefinitely and so ext4_xattr_block_set() keeps retrying indefinitely. The inconsistent state is caused by non-atomic update of e_reusable bit. e_reusable is part of a bitfield and e_reusable update can race with update of e_referenced bit in the same bitfield resulting in loss of one of the updates. Fix the problem by using atomic bitops instead. This bug has been around for many years, but it became *much* easier to hit after commit 65f8b80053a1 ("ext4: fix race when reusing xattr blocks"). Cc: stable@vger.kernel.org Fixes: 6048c64b2609 ("mbcache: add reusable flag to cache entries") Fixes: 65f8b80053a1 ("ext4: fix race when reusing xattr blocks") Reported-and-tested-by: Jeremi Piotrowski <jpiotrowski@linux.microsoft.com> Reported-by: Thilo Fromm <t-lo@linux.microsoft.com> Link: https://lore.kernel.org/r/c77bf00f-4618-7149-56f1-b8d1664b9d07@linux.microsoft.com/ Signed-off-by: Jan Kara <jack@suse.cz> Reviewed-by: Andreas Dilger <adilger@dilger.ca> Link: https://lore.kernel.org/r/20221123193950.16758-1-jack@suse.cz Signed-off-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: avoid BUG_ON when creating xattrsJan Kara1-8/+0
commit b40ebaf63851b3a401b0dc9263843538f64f5ce6 upstream. Commit fb0a387dcdcd ("ext4: limit block allocations for indirect-block files to < 2^32") added code to try to allocate xattr block with 32-bit block number for indirect block based files on the grounds that these files cannot use larger block numbers. It also added BUG_ON when allocated block could not fit into 32 bits. This is however bogus reasoning because xattr block is stored in inode->i_file_acl and inode->i_file_acl_hi and as such even indirect block based files can happily use full 48 bits for xattr block number. The proper handling seems to be there basically since 64-bit block number support was added. So remove the bogus limitation and BUG_ON. Cc: Eric Sandeen <sandeen@redhat.com> Fixes: fb0a387dcdcd ("ext4: limit block allocations for indirect-block files to < 2^32") Signed-off-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20221121130929.32031-1-jack@suse.cz Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: fix corrupt backup group descriptors after online resizeBaokun Li1-7/+15
commit 8f49ec603ae3e213bfab2799182724e3abac55a1 upstream. In commit 9a8c5b0d0615 ("ext4: update the backup superblock's at the end of the online resize"), it is assumed that update_backups() only updates backup superblocks, so each b_data is treated as a backupsuper block to update its s_block_group_nr and s_checksum. However, update_backups() also updates the backup group descriptors, which causes the backup group descriptors to be corrupted. The above commit fixes the problem of invalid checksum of the backup superblock. The root cause of this problem is that the checksum of ext4_update_super() is not set correctly. This problem has been fixed in the previous patch ("ext4: fix bad checksum after online resize"). However, we do need to set block_group_nr for the backup superblock in update_backups(). When a block is in a group that contains a backup superblock, and the block is the first block in the group, the block is definitely a superblock. We add a helper function that includes setting s_block_group_nr and updating checksum, and then call it only when the above conditions are met to prevent the backup group descriptors from being incorrectly modified. Fixes: 9a8c5b0d0615 ("ext4: update the backup superblock's at the end of the online resize") Signed-off-by: Baokun Li <libaokun1@huawei.com> Reviewed-by: Jan Kara <jack@suse.cz> Cc: stable@kernel.org Link: https://lore.kernel.org/r/20221117040341.1380702-3-libaokun1@huawei.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: dont return EINVAL from GETFSUUID when reporting UUID lengthDarrick J. Wong1-2/+3
commit b76abb5157468756163fe7e3431c9fe32cba57ca upstream. If userspace calls this ioctl with fsu_length (the length of the fsuuid.fsu_uuid array) set to zero, ext4 copies the desired uuid length out to userspace. The kernel call returned a result from a valid input, so the return value here should be zero, not EINVAL. While we're at it, fix the copy_to_user call to make it clear that we're only copying out fsu_len. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Catherine Hoang <catherine.hoang@oracle.com> Link: https://lore.kernel.org/r/166811138914.327006.9241306894437166566.stgit@magnolia Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: fix bad checksum after online resizeBaokun Li1-2/+2
commit a408f33e895e455f16cf964cb5cd4979b658db7b upstream. When online resizing is performed twice consecutively, the error message "Superblock checksum does not match superblock" is displayed for the second time. Here's the reproducer: mkfs.ext4 -F /dev/sdb 100M mount /dev/sdb /tmp/test resize2fs /dev/sdb 5G resize2fs /dev/sdb 6G To solve this issue, we moved the update of the checksum after the es->s_overhead_clusters is updated. Fixes: 026d0d27c488 ("ext4: reduce computation of overhead during resize") Fixes: de394a86658f ("ext4: update s_overhead_clusters in the superblock during an on-line resize") Signed-off-by: Baokun Li <libaokun1@huawei.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Jan Kara <jack@suse.cz> Cc: stable@kernel.org Link: https://lore.kernel.org/r/20221117040341.1380702-2-libaokun1@huawei.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: fix error code return to user-space in ext4_get_branch()Luís Henriques1-1/+8
commit 26d75a16af285a70863ba6a81f85d81e7e65da50 upstream. If a block is out of range in ext4_get_branch(), -ENOMEM will be returned to user-space. Obviously, this error code isn't really useful. This patch fixes it by making sure the right error code (-EFSCORRUPTED) is propagated to user-space. EUCLEAN is more informative than ENOMEM. Signed-off-by: Luís Henriques <lhenriques@suse.de> Link: https://lore.kernel.org/r/20221109181445.17843-1-lhenriques@suse.de Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: fix corruption when online resizing a 1K bigalloc fsBaokun Li1-3/+3
commit 0aeaa2559d6d53358fca3e3fce73807367adca74 upstream. When a backup superblock is updated in update_backups(), the primary superblock's offset in the group (that is, sbi->s_sbh->b_blocknr) is used as the backup superblock's offset in its group. However, when the block size is 1K and bigalloc is enabled, the two offsets are not equal. This causes the backup group descriptors to be overwritten by the superblock in update_backups(). Moreover, if meta_bg is enabled, the file system will be corrupted because this feature uses backup group descriptors. To solve this issue, we use a more accurate ext4_group_first_block_no() as the offset of the backup superblock in its group. Fixes: d77147ff443b ("ext4: add support for online resizing with bigalloc") Signed-off-by: Baokun Li <libaokun1@huawei.com> Reviewed-by: Jan Kara <jack@suse.cz> Cc: stable@kernel.org Link: https://lore.kernel.org/r/20221117040341.1380702-4-libaokun1@huawei.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: fix delayed allocation bug in ext4_clu_mapped for bigalloc + inlineEric Whitney1-0/+8
commit 131294c35ed6f777bd4e79d42af13b5c41bf2775 upstream. When converting files with inline data to extents, delayed allocations made on a file system created with both the bigalloc and inline options can result in invalid extent status cache content, incorrect reserved cluster counts, kernel memory leaks, and potential kernel panics. With bigalloc, the code that determines whether a block must be delayed allocated searches the extent tree to see if that block maps to a previously allocated cluster. If not, the block is delayed allocated, and otherwise, it isn't. However, if the inline option is also used, and if the file containing the block is marked as able to store data inline, there isn't a valid extent tree associated with the file. The current code in ext4_clu_mapped() calls ext4_find_extent() to search the non-existent tree for a previously allocated cluster anyway, which typically finds nothing, as desired. However, a side effect of the search can be to cache invalid content from the non-existent tree (garbage) in the extent status tree, including bogus entries in the pending reservation tree. To fix this, avoid searching the extent tree when allocating blocks for bigalloc + inline files that are being converted from inline to extent mapped. Signed-off-by: Eric Whitney <enwlinux@gmail.com> Link: https://lore.kernel.org/r/20221117152207.2424-1-enwlinux@gmail.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: don't fail GETFSUUID when the caller provides a long bufferDarrick J. Wong1-2/+4
commit a7e9d977e031fceefe1e7cd69ebd7202d5758b56 upstream. If userspace provides a longer UUID buffer than is required, we shouldn't fail the call with EINVAL -- rather, we can fill the caller's buffer with the bytes we /can/ fill, and update the length field to reflect what we copied. This doesn't break the UAPI since we're enabling a case that currently fails, and so far Ted hasn't released a version of e2fsprogs that uses the new ext4 ioctl. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Catherine Hoang <catherine.hoang@oracle.com> Link: https://lore.kernel.org/r/166811139478.327006.13879198441587445544.stgit@magnolia Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: init quota for 'old.inode' in 'ext4_rename'Ye Bin1-0/+3
commit fae381a3d79bb94aa2eb752170d47458d778b797 upstream. Syzbot found the following issue: ext4_parse_param: s_want_extra_isize=128 ext4_inode_info_init: s_want_extra_isize=32 ext4_rename: old.inode=ffff88823869a2c8 old.dir=ffff888238699828 new.inode=ffff88823869d7e8 new.dir=ffff888238699828 __ext4_mark_inode_dirty: inode=ffff888238699828 ea_isize=32 want_ea_size=128 __ext4_mark_inode_dirty: inode=ffff88823869a2c8 ea_isize=32 want_ea_size=128 ext4_xattr_block_set: inode=ffff88823869a2c8 ------------[ cut here ]------------ WARNING: CPU: 13 PID: 2234 at fs/ext4/xattr.c:2070 ext4_xattr_block_set.cold+0x22/0x980 Modules linked in: RIP: 0010:ext4_xattr_block_set.cold+0x22/0x980 RSP: 0018:ffff888227d3f3b0 EFLAGS: 00010202 RAX: 0000000000000001 RBX: ffff88823007a000 RCX: 0000000000000000 RDX: 0000000000000a03 RSI: 0000000000000040 RDI: ffff888230078178 RBP: 0000000000000000 R08: 000000000000002c R09: ffffed1075c7df8e R10: ffff8883ae3efc6b R11: ffffed1075c7df8d R12: 0000000000000000 R13: ffff88823869a2c8 R14: ffff8881012e0460 R15: dffffc0000000000 FS: 00007f350ac1f740(0000) GS:ffff8883ae200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f350a6ed6a0 CR3: 0000000237456000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: <TASK> ? ext4_xattr_set_entry+0x3b7/0x2320 ? ext4_xattr_block_set+0x0/0x2020 ? ext4_xattr_set_entry+0x0/0x2320 ? ext4_xattr_check_entries+0x77/0x310 ? ext4_xattr_ibody_set+0x23b/0x340 ext4_xattr_move_to_block+0x594/0x720 ext4_expand_extra_isize_ea+0x59a/0x10f0 __ext4_expand_extra_isize+0x278/0x3f0 __ext4_mark_inode_dirty.cold+0x347/0x410 ext4_rename+0xed3/0x174f vfs_rename+0x13a7/0x2510 do_renameat2+0x55d/0x920 __x64_sys_rename+0x7d/0xb0 do_syscall_64+0x3b/0xa0 entry_SYSCALL_64_after_hwframe+0x72/0xdc As 'ext4_rename' will modify 'old.inode' ctime and mark inode dirty, which may trigger expand 'extra_isize' and allocate block. If inode didn't init quota will lead to warning. To solve above issue, init 'old.inode' firstly in 'ext4_rename'. Reported-by: syzbot+98346927678ac3059c77@syzkaller.appspotmail.com Signed-off-by: Ye Bin <yebin10@huawei.com> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20221107015335.2524319-1-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: fix uninititialized value in 'ext4_evict_inode'Ye Bin1-0/+1
commit 7ea71af94eaaaf6d9aed24bc94a05b977a741cb9 upstream. Syzbot found the following issue: ===================================================== BUG: KMSAN: uninit-value in ext4_evict_inode+0xdd/0x26b0 fs/ext4/inode.c:180 ext4_evict_inode+0xdd/0x26b0 fs/ext4/inode.c:180 evict+0x365/0x9a0 fs/inode.c:664 iput_final fs/inode.c:1747 [inline] iput+0x985/0xdd0 fs/inode.c:1773 __ext4_new_inode+0xe54/0x7ec0 fs/ext4/ialloc.c:1361 ext4_mknod+0x376/0x840 fs/ext4/namei.c:2844 vfs_mknod+0x79d/0x830 fs/namei.c:3914 do_mknodat+0x47d/0xaa0 __do_sys_mknodat fs/namei.c:3992 [inline] __se_sys_mknodat fs/namei.c:3989 [inline] __ia32_sys_mknodat+0xeb/0x150 fs/namei.c:3989 do_syscall_32_irqs_on arch/x86/entry/common.c:112 [inline] __do_fast_syscall_32+0xa2/0x100 arch/x86/entry/common.c:178 do_fast_syscall_32+0x33/0x70 arch/x86/entry/common.c:203 do_SYSENTER_32+0x1b/0x20 arch/x86/entry/common.c:246 entry_SYSENTER_compat_after_hwframe+0x70/0x82 Uninit was created at: __alloc_pages+0x9f1/0xe80 mm/page_alloc.c:5578 alloc_pages+0xaae/0xd80 mm/mempolicy.c:2285 alloc_slab_page mm/slub.c:1794 [inline] allocate_slab+0x1b5/0x1010 mm/slub.c:1939 new_slab mm/slub.c:1992 [inline] ___slab_alloc+0x10c3/0x2d60 mm/slub.c:3180 __slab_alloc mm/slub.c:3279 [inline] slab_alloc_node mm/slub.c:3364 [inline] slab_alloc mm/slub.c:3406 [inline] __kmem_cache_alloc_lru mm/slub.c:3413 [inline] kmem_cache_alloc_lru+0x6f3/0xb30 mm/slub.c:3429 alloc_inode_sb include/linux/fs.h:3117 [inline] ext4_alloc_inode+0x5f/0x860 fs/ext4/super.c:1321 alloc_inode+0x83/0x440 fs/inode.c:259 new_inode_pseudo fs/inode.c:1018 [inline] new_inode+0x3b/0x430 fs/inode.c:1046 __ext4_new_inode+0x2a7/0x7ec0 fs/ext4/ialloc.c:959 ext4_mkdir+0x4d5/0x1560 fs/ext4/namei.c:2992 vfs_mkdir+0x62a/0x870 fs/namei.c:4035 do_mkdirat+0x466/0x7b0 fs/namei.c:4060 __do_sys_mkdirat fs/namei.c:4075 [inline] __se_sys_mkdirat fs/namei.c:4073 [inline] __ia32_sys_mkdirat+0xc4/0x120 fs/namei.c:4073 do_syscall_32_irqs_on arch/x86/entry/common.c:112 [inline] __do_fast_syscall_32+0xa2/0x100 arch/x86/entry/common.c:178 do_fast_syscall_32+0x33/0x70 arch/x86/entry/common.c:203 do_SYSENTER_32+0x1b/0x20 arch/x86/entry/common.c:246 entry_SYSENTER_compat_after_hwframe+0x70/0x82 CPU: 1 PID: 4625 Comm: syz-executor.2 Not tainted 6.1.0-rc4-syzkaller-62821-gcb231e2f67ec #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022 ===================================================== Now, 'ext4_alloc_inode()' didn't init 'ei->i_flags'. If new inode failed before set 'ei->i_flags' in '__ext4_new_inode()', then do 'iput()'. As after 6bc0d63dad7f commit will access 'ei->i_flags' in 'ext4_evict_inode()' which will lead to access uninit-value. To solve above issue just init 'ei->i_flags' in 'ext4_alloc_inode()'. Reported-by: syzbot+57b25da729eb0b88177d@syzkaller.appspotmail.com Signed-off-by: Ye Bin <yebin10@huawei.com> Fixes: 6bc0d63dad7f ("ext4: remove EA inode entry from mbcache on inode eviction") Reviewed-by: Jan Kara <jack@suse.cz> Reviewed-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20221117073603.2598882-1-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: fix off-by-one errors in fast-commit block fillingEric Biggers1-33/+33
commit 48a6a66db82b8043d298a630f22c62d43550cae5 upstream. Due to several different off-by-one errors, or perhaps due to a late change in design that wasn't fully reflected in the code that was actually merged, there are several very strange constraints on how fast-commit blocks are filled with tlv entries: - tlvs must start at least 10 bytes before the end of the block, even though the minimum tlv length is 8. Otherwise, the replay code will ignore them. (BUG: ext4_fc_reserve_space() could violate this requirement if called with a len of blocksize - 9 or blocksize - 8. Fortunately, this doesn't seem to happen currently.) - tlvs must end at least 1 byte before the end of the block. Otherwise the replay code will consider them to be invalid. This quirk contributed to a bug (fixed by an earlier commit) where uninitialized memory was being leaked to disk in the last byte of blocks. Also, strangely these constraints don't apply to the replay code in e2fsprogs, which will accept any tlvs in the blocks (with no bounds checks at all, but that is a separate issue...). Given that this all seems to be a bug, let's fix it by just filling blocks with tlv entries in the natural way. Note that old kernels will be unable to replay fast-commit journals created by kernels that have this commit. Fixes: aa75f4d3daae ("ext4: main fast-commit commit path") Cc: <stable@vger.kernel.org> # v5.10+ Signed-off-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20221106224841.279231-7-ebiggers@kernel.org Signed-off-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: fix unaligned memory access in ext4_fc_reserve_space()Eric Biggers1-18/+21
commit 8415ce07ecf0cc25efdd5db264a7133716e503cf upstream. As is done elsewhere in the file, build the struct ext4_fc_tl on the stack and memcpy() it into the buffer, rather than directly writing it to a potentially-unaligned location in the buffer. Fixes: aa75f4d3daae ("ext4: main fast-commit commit path") Cc: <stable@vger.kernel.org> # v5.10+ Signed-off-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20221106224841.279231-6-ebiggers@kernel.org Signed-off-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: add missing validation of fast-commit record lengthsEric Biggers2-20/+20
commit 64b4a25c3de81a69724e888ec2db3533b43816e2 upstream. Validate the inode and filename lengths in fast-commit journal records so that a malicious fast-commit journal cannot cause a crash by having invalid values for these. Also validate EXT4_FC_TAG_DEL_RANGE. Fixes: aa75f4d3daae ("ext4: main fast-commit commit path") Cc: <stable@vger.kernel.org> # v5.10+ Signed-off-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20221106224841.279231-5-ebiggers@kernel.org Signed-off-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: don't set up encryption key during jbd2 transactionEric Biggers3-23/+27
commit 4c0d5778385cb3618ff26a561ce41de2b7d9de70 upstream. Commit a80f7fcf1867 ("ext4: fixup ext4_fc_track_* functions' signature") extended the scope of the transaction in ext4_unlink() too far, making it include the call to ext4_find_entry(). However, ext4_find_entry() can deadlock when called from within a transaction because it may need to set up the directory's encryption key. Fix this by restoring the transaction to its original scope. Reported-by: syzbot+1a748d0007eeac3ab079@syzkaller.appspotmail.com Fixes: a80f7fcf1867 ("ext4: fixup ext4_fc_track_* functions' signature") Cc: <stable@vger.kernel.org> # v5.10+ Signed-off-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20221106224841.279231-3-ebiggers@kernel.org Signed-off-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: fix leaking uninitialized memory in fast-commit journalEric Biggers1-0/+5
commit 594bc43b410316d70bb42aeff168837888d96810 upstream. When space at the end of fast-commit journal blocks is unused, make sure to zero it out so that uninitialized memory is not leaked to disk. Fixes: aa75f4d3daae ("ext4: main fast-commit commit path") Cc: <stable@vger.kernel.org> # v5.10+ Signed-off-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20221106224841.279231-4-ebiggers@kernel.org Signed-off-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: disable fast-commit of encrypted dir operationsEric Biggers2-16/+26
commit 0fbcb5251fc81b58969b272c4fb7374a7b922e3e upstream. fast-commit of create, link, and unlink operations in encrypted directories is completely broken because the unencrypted filenames are being written to the fast-commit journal instead of the encrypted filenames. These operations can't be replayed, as encryption keys aren't present at journal replay time. It is also an information leak. Until if/when we can get this working properly, make encrypted directory operations ineligible for fast-commit. Note that fast-commit operations on encrypted regular files continue to be allowed, as they seem to work. Fixes: aa75f4d3daae ("ext4: main fast-commit commit path") Cc: <stable@vger.kernel.org> # v5.10+ Signed-off-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20221106224841.279231-2-ebiggers@kernel.org Signed-off-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: don't allow journal inode to have encrypt flagEric Biggers1-1/+1
commit 105c78e12468413e426625831faa7db4284e1fec upstream. Mounting a filesystem whose journal inode has the encrypt flag causes a NULL dereference in fscrypt_limit_io_blocks() when the 'inlinecrypt' mount option is used. The problem is that when jbd2_journal_init_inode() calls bmap(), it eventually finds its way into ext4_iomap_begin(), which calls fscrypt_limit_io_blocks(). fscrypt_limit_io_blocks() requires that if the inode is encrypted, then its encryption key must already be set up. That's not the case here, since the journal inode is never "opened" like a normal file would be. Hence the crash. A reproducer is: mkfs.ext4 -F /dev/vdb debugfs -w /dev/vdb -R "set_inode_field <8> flags 0x80808" mount /dev/vdb /mnt -o inlinecrypt To fix this, make ext4 consider journal inodes with the encrypt flag to be invalid. (Note, maybe other flags should be rejected on the journal inode too. For now, this is just the minimal fix for the above issue.) I've marked this as fixing the commit that introduced the call to fscrypt_limit_io_blocks(), since that's what made an actual crash start being possible. But this fix could be applied to any version of ext4 that supports the encrypt feature. Reported-by: syzbot+ba9dac45bc76c490b7c3@syzkaller.appspotmail.com Fixes: 38ea50daa7a4 ("ext4: support direct I/O with fscrypt using blk-crypto") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20221102053312.189962-1-ebiggers@kernel.org Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: fix bug_on in __es_tree_search caused by bad boot loader inodeBaokun Li1-1/+1
commit 991ed014de0840c5dc405b679168924afb2952ac upstream. We got a issue as fllows: ================================================================== kernel BUG at fs/ext4/extents_status.c:203! invalid opcode: 0000 [#1] PREEMPT SMP CPU: 1 PID: 945 Comm: cat Not tainted 6.0.0-next-20221007-dirty #349 RIP: 0010:ext4_es_end.isra.0+0x34/0x42 RSP: 0018:ffffc9000143b768 EFLAGS: 00010203 RAX: 0000000000000000 RBX: ffff8881769cd0b8 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff8fc27cf7 RDI: 00000000ffffffff RBP: ffff8881769cd0bc R08: 0000000000000000 R09: ffffc9000143b5f8 R10: 0000000000000001 R11: 0000000000000001 R12: ffff8881769cd0a0 R13: ffff8881768e5668 R14: 00000000768e52f0 R15: 0000000000000000 FS: 00007f359f7f05c0(0000)GS:ffff88842fd00000(0000)knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f359f5a2000 CR3: 000000017130c000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: <TASK> __es_tree_search.isra.0+0x6d/0xf5 ext4_es_cache_extent+0xfa/0x230 ext4_cache_extents+0xd2/0x110 ext4_find_extent+0x5d5/0x8c0 ext4_ext_map_blocks+0x9c/0x1d30 ext4_map_blocks+0x431/0xa50 ext4_mpage_readpages+0x48e/0xe40 ext4_readahead+0x47/0x50 read_pages+0x82/0x530 page_cache_ra_unbounded+0x199/0x2a0 do_page_cache_ra+0x47/0x70 page_cache_ra_order+0x242/0x400 ondemand_readahead+0x1e8/0x4b0 page_cache_sync_ra+0xf4/0x110 filemap_get_pages+0x131/0xb20 filemap_read+0xda/0x4b0 generic_file_read_iter+0x13a/0x250 ext4_file_read_iter+0x59/0x1d0 vfs_read+0x28f/0x460 ksys_read+0x73/0x160 __x64_sys_read+0x1e/0x30 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd </TASK> ================================================================== In the above issue, ioctl invokes the swap_inode_boot_loader function to swap inode<5> and inode<12>. However, inode<5> contain incorrect imode and disordered extents, and i_nlink is set to 1. The extents check for inode in the ext4_iget function can be bypassed bacause 5 is EXT4_BOOT_LOADER_INO. While links_count is set to 1, the extents are not initialized in swap_inode_boot_loader. After the ioctl command is executed successfully, the extents are swapped to inode<12>, in this case, run the `cat` command to view inode<12>. And Bug_ON is triggered due to the incorrect extents. When the boot loader inode is not initialized, its imode can be one of the following: 1) the imode is a bad type, which is marked as bad_inode in ext4_iget and set to S_IFREG. 2) the imode is good type but not S_IFREG. 3) the imode is S_IFREG. The BUG_ON may be triggered by bypassing the check in cases 1 and 2. Therefore, when the boot loader inode is bad_inode or its imode is not S_IFREG, initialize the inode to avoid triggering the BUG. Signed-off-by: Baokun Li <libaokun1@huawei.com> Reviewed-by: Jason Yan <yanaijie@huawei.com> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20221026042310.3839669-5-libaokun1@huawei.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: check and assert if marking an no_delete evicting inode dirtyZhang Yi1-0/+6
commit 318cdc822c63b6e2befcfdc2088378ae6fa18def upstream. In ext4_evict_inode(), if we evicting an inode in the 'no_delete' path, it cannot be raced by another mark_inode_dirty(). If it happens, someone else may accidentally dirty it without holding inode refcount and probably cause use-after-free issues in the writeback procedure. It's indiscoverable and hard to debug, so add an WARN_ON_ONCE() to check and detect this issue in advance. Suggested-by: Jan Kara <jack@suse.cz> Signed-off-by: Zhang Yi <yi.zhang@huawei.com> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20220629112647.4141034-2-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: journal_path mount options should follow linksLukas Czerner1-1/+1
commit e3ea75ee651daf5e434afbfdb7dbf75e200ea1f6 upstream. Before the commit 461c3af045d3 ("ext4: Change handle_mount_opt() to use fs_parameter") ext4 mount option journal_path did follow links in the provided path. Bring this behavior back by allowing to pass pathwalk flags to fs_lookup_param(). Fixes: 461c3af045d3 ("ext4: Change handle_mount_opt() to use fs_parameter") Signed-off-by: Lukas Czerner <lczerner@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Link: https://lore.kernel.org/r/20221004135803.32283-1-lczerner@redhat.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: fix reserved cluster accounting in __es_remove_extent()Ye Bin1-1/+2
commit 1da18e38cb97e9521e93d63034521a9649524f64 upstream. When bigalloc is enabled, reserved cluster accounting for delayed allocation is handled in extent_status.c. With a corrupted file system, it's possible for this accounting to be incorrect, dsicovered by Syzbot: EXT4-fs error (device loop0): ext4_validate_block_bitmap:398: comm rep: bg 0: block 5: invalid block bitmap EXT4-fs (loop0): Delayed block allocation failed for inode 18 at logical offset 0 with max blocks 32 with error 28 EXT4-fs (loop0): This should not happen!! Data will be lost EXT4-fs (loop0): Total free blocks count 0 EXT4-fs (loop0): Free/Dirty block details EXT4-fs (loop0): free_blocks=0 EXT4-fs (loop0): dirty_blocks=32 EXT4-fs (loop0): Block reservation details EXT4-fs (loop0): i_reserved_data_blocks=2 EXT4-fs (loop0): Inode 18 (00000000845cd634): i_reserved_data_blocks (1) not cleared! Above issue happens as follows: Assume: sbi->s_cluster_ratio = 16 Step1: Insert delay block [0, 31] -> ei->i_reserved_data_blocks=2 Step2: ext4_writepages mpage_map_and_submit_extent -> return failed mpage_release_unused_pages -> to release [0, 30] ext4_es_remove_extent -> remove lblk=0 end=30 __es_remove_extent -> len1=0 len2=31-30=1 __es_remove_extent: ... if (len2 > 0) { ... if (len1 > 0) { ... } else { es->es_lblk = end + 1; es->es_len = len2; ... } if (count_reserved) count_rsvd(inode, lblk, ...); goto out; -> will return but didn't calculate 'reserved' ... Step3: ext4_destroy_inode -> trigger "i_reserved_data_blocks (1) not cleared!" To solve above issue if 'len2>0' call 'get_rsvd()' before goto out. Reported-by: syzbot+05a0f0ccab4a25626e38@syzkaller.appspotmail.com Fixes: 8fcc3a580651 ("ext4: rework reserved cluster accounting when invalidating pages") Signed-off-by: Ye Bin <yebin10@huawei.com> Reviewed-by: Eric Whitney <enwlinux@gmail.com> Link: https://lore.kernel.org/r/20221208033426.1832460-2-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: add helper to check quota inumsBaokun Li1-3/+25
commit 07342ec259df2a35d6a34aebce010567a80a0e15 upstream. Before quota is enabled, a check on the preset quota inums in ext4_super_block is added to prevent wrong quota inodes from being loaded. In addition, when the quota fails to be enabled, the quota type and quota inum are printed to facilitate fault locating. Signed-off-by: Baokun Li <libaokun1@huawei.com> Reviewed-by: Jason Yan <yanaijie@huawei.com> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20221026042310.3839669-3-libaokun1@huawei.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: add EXT4_IGET_BAD flag to prevent unexpected bad inodeBaokun Li3-3/+11
commit 63b1e9bccb71fe7d7e3ddc9877dbdc85e5d2d023 upstream. There are many places that will get unhappy (and crash) when ext4_iget() returns a bad inode. However, if iget the boot loader inode, allows a bad inode to be returned, because the inode may not be initialized. This mechanism can be used to bypass some checks and cause panic. To solve this problem, we add a special iget flag EXT4_IGET_BAD. Only with this flag we'd be returning bad inode from ext4_iget(), otherwise we always return the error code if the inode is bad inode.(suggested by Jan Kara) Signed-off-by: Baokun Li <libaokun1@huawei.com> Reviewed-by: Jason Yan <yanaijie@huawei.com> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20221026042310.3839669-4-libaokun1@huawei.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: fix undefined behavior in bit shift for ext4_check_flag_valuesGaosheng Cui1-1/+1
commit 3bf678a0f9c017c9ba7c581541dbc8453452a7ae upstream. Shifting signed 32-bit value by 31 bits is undefined, so changing significant bit to unsigned. The UBSAN warning calltrace like below: UBSAN: shift-out-of-bounds in fs/ext4/ext4.h:591:2 left shift of 1 by 31 places cannot be represented in type 'int' Call Trace: <TASK> dump_stack_lvl+0x7d/0xa5 dump_stack+0x15/0x1b ubsan_epilogue+0xe/0x4e __ubsan_handle_shift_out_of_bounds+0x1e7/0x20c ext4_init_fs+0x5a/0x277 do_one_initcall+0x76/0x430 kernel_init_freeable+0x3b3/0x422 kernel_init+0x24/0x1e0 ret_from_fork+0x1f/0x30 </TASK> Fixes: 9a4c80194713 ("ext4: ensure Inode flags consistency are checked at build time") Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com> Link: https://lore.kernel.org/r/20221031055833.3966222-1-cuigaosheng1@huawei.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: fix use-after-free in ext4_orphan_cleanupBaokun Li1-1/+2
commit a71248b1accb2b42e4980afef4fa4a27fa0e36f5 upstream. I caught a issue as follows: ================================================================== BUG: KASAN: use-after-free in __list_add_valid+0x28/0x1a0 Read of size 8 at addr ffff88814b13f378 by task mount/710 CPU: 1 PID: 710 Comm: mount Not tainted 6.1.0-rc3-next #370 Call Trace: <TASK> dump_stack_lvl+0x73/0x9f print_report+0x25d/0x759 kasan_report+0xc0/0x120 __asan_load8+0x99/0x140 __list_add_valid+0x28/0x1a0 ext4_orphan_cleanup+0x564/0x9d0 [ext4] __ext4_fill_super+0x48e2/0x5300 [ext4] ext4_fill_super+0x19f/0x3a0 [ext4] get_tree_bdev+0x27b/0x450 ext4_get_tree+0x19/0x30 [ext4] vfs_get_tree+0x49/0x150 path_mount+0xaae/0x1350 do_mount+0xe2/0x110 __x64_sys_mount+0xf0/0x190 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd </TASK> [...] ================================================================== Above issue may happen as follows: ------------------------------------- ext4_fill_super ext4_orphan_cleanup --- loop1: assume last_orphan is 12 --- list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan) ext4_truncate --> return 0 ext4_inode_attach_jinode --> return -ENOMEM iput(inode) --> free inode<12> --- loop2: last_orphan is still 12 --- list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); // use inode<12> and trigger UAF To solve this issue, we need to propagate the return value of ext4_inode_attach_jinode() appropriately. Signed-off-by: Baokun Li <libaokun1@huawei.com> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20221102080633.1630225-1-libaokun1@huawei.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07fs: ext4: initialize fsdata in pagecache_write()Alexander Potapenko1-1/+1
commit 956510c0c7439e90b8103aaeaf4da92878c622f0 upstream. When aops->write_begin() does not initialize fsdata, KMSAN reports an error passing the latter to aops->write_end(). Fix this by unconditionally initializing fsdata. Cc: Eric Biggers <ebiggers@kernel.org> Fixes: c93d8f885809 ("ext4: add basic fs-verity support") Reported-by: syzbot+9767be679ef5016b6082@syzkaller.appspotmail.com Signed-off-by: Alexander Potapenko <glider@google.com> Reviewed-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20221121112134.407362-1-glider@google.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: correct inconsistent error msg in nojournal modeBaokun Li1-4/+5
commit 89481b5fa8c0640e62ba84c6020cee895f7ac643 upstream. When we used the journal_async_commit mounting option in nojournal mode, the kernel told me that "can't mount with journal_checksum", was very confusing. I find that when we mount with journal_async_commit, both the JOURNAL_ASYNC_COMMIT and EXPLICIT_JOURNAL_CHECKSUM flags are set. However, in the error branch, CHECKSUM is checked before ASYNC_COMMIT. As a result, the above inconsistency occurs, and the ASYNC_COMMIT branch becomes dead code that cannot be executed. Therefore, we exchange the positions of the two judgments to make the error msg more accurate. Signed-off-by: Baokun Li <libaokun1@huawei.com> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20221109074343.4184862-1-libaokun1@huawei.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: remove trailing newline from ext4_msg() messageLuís Henriques1-1/+1
commit 78742d4d056df7d2fad241c90185d281bf924844 upstream. The ext4_msg() function adds a new line to the message. Remove extra '\n' from call to ext4_msg() in ext4_orphan_cleanup(). Signed-off-by: Luís Henriques <lhenriques@suse.de> Link: https://lore.kernel.org/r/20221011155758.15287-1-lhenriques@suse.de Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: add inode table check in __ext4_get_inode_loc to aovid possible ↵Baokun Li1-1/+9
infinite loop commit eee22187b53611e173161e38f61de1c7ecbeb876 upstream. In do_writepages, if the value returned by ext4_writepages is "-ENOMEM" and "wbc->sync_mode == WB_SYNC_ALL", retry until the condition is not met. In __ext4_get_inode_loc, if the bh returned by sb_getblk is NULL, the function returns -ENOMEM. In __getblk_slow, if the return value of grow_buffers is less than 0, the function returns NULL. When the three processes are connected in series like the following stack, an infinite loop may occur: do_writepages <--- keep retrying ext4_writepages mpage_map_and_submit_extent mpage_map_one_extent ext4_map_blocks ext4_ext_map_blocks ext4_ext_handle_unwritten_extents ext4_ext_convert_to_initialized ext4_split_extent ext4_split_extent_at __ext4_ext_dirty __ext4_mark_inode_dirty ext4_reserve_inode_write ext4_get_inode_loc __ext4_get_inode_loc <--- return -ENOMEM sb_getblk __getblk_gfp __getblk_slow <--- return NULL grow_buffers grow_dev_page <--- return -ENXIO ret = (block < end_block) ? 1 : -ENXIO; In this issue, bg_inode_table_hi is overwritten as an incorrect value. As a result, `block < end_block` cannot be met in grow_dev_page. Therefore, __ext4_get_inode_loc always returns '-ENOMEM' and do_writepages keeps retrying. As a result, the writeback process is in the D state due to an infinite loop. Add a check on inode table block in the __ext4_get_inode_loc function by referring to ext4_read_inode_bitmap to avoid this infinite loop. Cc: stable@kernel.org Signed-off-by: Baokun Li <libaokun1@huawei.com> Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com> Link: https://lore.kernel.org/r/20220817132701.3015912-3-libaokun1@huawei.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-01-07ext4: silence the warning when evicting inode with dioread_nolockZhang Yi1-5/+5
commit bc12ac98ea2e1b70adc6478c8b473a0003b659d3 upstream. When evicting an inode with default dioread_nolock, it could be raced by the unwritten extents converting kworker after writeback some new allocated dirty blocks. It convert unwritten extents to written, the extents could be merged to upper level and free extent blocks, so it could mark the inode dirty again even this inode has been marked I_FREEING. But the inode->i_io_list check and warning in ext4_evict_inode() missing this corner case. Fortunately, ext4_evict_inode() will wait all extents converting finished before this check, so it will not lead to inode use-after-free problem, every thing is OK besides this warning. The WARN_ON_ONCE was originally designed for finding inode use-after-free issues in advance, but if we add current dioread_nolock case in, it will become not quite useful, so fix this warning by just remove this check. ====== WARNING: CPU: 7 PID: 1092 at fs/ext4/inode.c:227 ext4_evict_inode+0x875/0xc60 ... RIP: 0010:ext4_evict_inode+0x875/0xc60 ... Call Trace: <TASK> evict+0x11c/0x2b0 iput+0x236/0x3a0 do_unlinkat+0x1b4/0x490 __x64_sys_unlinkat+0x4c/0xb0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x46/0xb0 RIP: 0033:0x7fa933c1115b ====== rm kworker ext4_end_io_end() vfs_unlink() ext4_unlink() ext4_convert_unwritten_io_end_vec() ext4_convert_unwritten_extents() ext4_map_blocks() ext4_ext_map_blocks() ext4_ext_try_to_merge_up() __mark_inode_dirty() check !I_FREEING locked_inode_to_wb_and_lock_list() iput() iput_final() evict() ext4_evict_inode() truncate_inode_pages_final() //wait release io_end inode_io_list_move_locked() ext4_release_io_end() trigger WARN_ON_ONCE() Cc: stable@kernel.org Fixes: ceff86fddae8 ("ext4: Avoid freeing inodes on dirty list") Signed-off-by: Zhang Yi <yi.zhang@huawei.com> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20220629112647.4141034-1-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2022-11-07ext4: fix use-after-free in ext4_ext_shift_extentsBaokun Li1-5/+13
If the starting position of our insert range happens to be in the hole between the two ext4_extent_idx, because the lblk of the ext4_extent in the previous ext4_extent_idx is always less than the start, which leads to the "extent" variable access across the boundary, the following UAF is triggered: ================================================================== BUG: KASAN: use-after-free in ext4_ext_shift_extents+0x257/0x790 Read of size 4 at addr ffff88819807a008 by task fallocate/8010 CPU: 3 PID: 8010 Comm: fallocate Tainted: G E 5.10.0+ #492 Call Trace: dump_stack+0x7d/0xa3 print_address_description.constprop.0+0x1e/0x220 kasan_report.cold+0x67/0x7f ext4_ext_shift_extents+0x257/0x790 ext4_insert_range+0x5b6/0x700 ext4_fallocate+0x39e/0x3d0 vfs_fallocate+0x26f/0x470 ksys_fallocate+0x3a/0x70 __x64_sys_fallocate+0x4f/0x60 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 ================================================================== For right shifts, we can divide them into the following situations: 1. When the first ee_block of ext4_extent_idx is greater than or equal to start, make right shifts directly from the first ee_block. 1) If it is greater than start, we need to continue searching in the previous ext4_extent_idx. 2) If it is equal to start, we can exit the loop (iterator=NULL). 2. When the first ee_block of ext4_extent_idx is less than start, then traverse from the last extent to find the first extent whose ee_block is less than start. 1) If extent is still the last extent after traversal, it means that the last ee_block of ext4_extent_idx is less than start, that is, start is located in the hole between idx and (idx+1), so we can exit the loop directly (break) without right shifts. 2) Otherwise, make right shifts at the corresponding position of the found extent, and then exit the loop (iterator=NULL). Fixes: 331573febb6a ("ext4: Add support FALLOC_FL_INSERT_RANGE for fallocate") Cc: stable@vger.kernel.org # v4.2+ Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Baokun Li <libaokun1@huawei.com> Link: https://lore.kernel.org/r/20220922120434.1294789-1-libaokun1@huawei.com Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2022-11-06Merge tag 'ext4_for_linus_stable' of ↵Linus Torvalds6-7/+21
git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4 Pull ext4 fixes from Ted Ts'o: "Fix a number of bugs, including some regressions, the most serious of which was one which would cause online resizes to fail with file systems with metadata checksums enabled. Also fix a warning caused by the newly added fortify string checker, plus some bugs that were found using fuzzed file systems" * tag 'ext4_for_linus_stable' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: ext4: fix fortify warning in fs/ext4/fast_commit.c:1551 ext4: fix wrong return err in ext4_load_and_init_journal() ext4: fix warning in 'ext4_da_release_space' ext4: fix BUG_ON() when directory entry has invalid rec_len ext4: update the backup superblock's at the end of the online resize
2022-11-06ext4: fix fortify warning in fs/ext4/fast_commit.c:1551Theodore Ts'o1-2/+3
With the new fortify string system, rework the memcpy to avoid this warning: memcpy: detected field-spanning write (size 60) of single field "&raw_inode->i_generation" at fs/ext4/fast_commit.c:1551 (size 4) Cc: stable@kernel.org Fixes: 54d9469bc515 ("fortify: Add run-time WARN for cross-field memcpy()") Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2022-11-06ext4: fix wrong return err in ext4_load_and_init_journal()Jason Yan1-1/+1
The return value is wrong in ext4_load_and_init_journal(). The local variable 'err' need to be initialized before goto out. The original code in __ext4_fill_super() is fine because it has two return values 'ret' and 'err' and 'ret' is initialized as -EINVAL. After we factor out ext4_load_and_init_journal(), this code is broken. So fix it by directly returning -EINVAL in the error handler path. Cc: stable@kernel.org Fixes: 9c1dd22d7422 ("ext4: factor out ext4_load_and_init_journal()") Signed-off-by: Jason Yan <yanaijie@huawei.com> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20221025040206.3134773-1-yanaijie@huawei.com Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2022-11-06ext4: fix warning in 'ext4_da_release_space'Ye Bin1-1/+2
Syzkaller report issue as follows: EXT4-fs (loop0): Free/Dirty block details EXT4-fs (loop0): free_blocks=0 EXT4-fs (loop0): dirty_blocks=0 EXT4-fs (loop0): Block reservation details EXT4-fs (loop0): i_reserved_data_blocks=0 EXT4-fs warning (device loop0): ext4_da_release_space:1527: ext4_da_release_space: ino 18, to_free 1 with only 0 reserved data blocks ------------[ cut here ]------------ WARNING: CPU: 0 PID: 92 at fs/ext4/inode.c:1528 ext4_da_release_space+0x25e/0x370 fs/ext4/inode.c:1524 Modules linked in: CPU: 0 PID: 92 Comm: kworker/u4:4 Not tainted 6.0.0-syzkaller-09423-g493ffd6605b2 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/22/2022 Workqueue: writeback wb_workfn (flush-7:0) RIP: 0010:ext4_da_release_space+0x25e/0x370 fs/ext4/inode.c:1528 RSP: 0018:ffffc900015f6c90 EFLAGS: 00010296 RAX: 42215896cd52ea00 RBX: 0000000000000000 RCX: 42215896cd52ea00 RDX: 0000000000000000 RSI: 0000000080000001 RDI: 0000000000000000 RBP: 1ffff1100e907d96 R08: ffffffff816aa79d R09: fffff520002bece5 R10: fffff520002bece5 R11: 1ffff920002bece4 R12: ffff888021fd2000 R13: ffff88807483ecb0 R14: 0000000000000001 R15: ffff88807483e740 FS: 0000000000000000(0000) GS:ffff8880b9a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005555569ba628 CR3: 000000000c88e000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: <TASK> ext4_es_remove_extent+0x1ab/0x260 fs/ext4/extents_status.c:1461 mpage_release_unused_pages+0x24d/0xef0 fs/ext4/inode.c:1589 ext4_writepages+0x12eb/0x3be0 fs/ext4/inode.c:2852 do_writepages+0x3c3/0x680 mm/page-writeback.c:2469 __writeback_single_inode+0xd1/0x670 fs/fs-writeback.c:1587 writeback_sb_inodes+0xb3b/0x18f0 fs/fs-writeback.c:1870 wb_writeback+0x41f/0x7b0 fs/fs-writeback.c:2044 wb_do_writeback fs/fs-writeback.c:2187 [inline] wb_workfn+0x3cb/0xef0 fs/fs-writeback.c:2227 process_one_work+0x877/0xdb0 kernel/workqueue.c:2289 worker_thread+0xb14/0x1330 kernel/workqueue.c:2436 kthread+0x266/0x300 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:306 </TASK> Above issue may happens as follows: ext4_da_write_begin ext4_create_inline_data ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA); __ext4_ioctl ext4_ext_migrate -> will lead to eh->eh_entries not zero, and set extent flag ext4_da_write_begin ext4_da_convert_inline_data_to_extent ext4_da_write_inline_data_begin ext4_da_map_blocks ext4_insert_delayed_block if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) if (!ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk)) ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk)); -> will return 1 allocated = true; ext4_es_insert_delayed_block(inode, lblk, allocated); ext4_writepages mpage_map_and_submit_extent(handle, &mpd, &give_up_on_write); -> return -ENOSPC mpage_release_unused_pages(&mpd, give_up_on_write); -> give_up_on_write == 1 ext4_es_remove_extent ext4_da_release_space(inode, reserved); if (unlikely(to_free > ei->i_reserved_data_blocks)) -> to_free == 1 but ei->i_reserved_data_blocks == 0 -> then trigger warning as above To solve above issue, forbid inode do migrate which has inline data. Cc: stable@kernel.org Reported-by: syzbot+c740bb18df70ad00952e@syzkaller.appspotmail.com Signed-off-by: Ye Bin <yebin10@huawei.com> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20221018022701.683489-1-yebin10@huawei.com Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2022-11-06ext4: fix BUG_ON() when directory entry has invalid rec_lenLuís Henriques1-1/+9
The rec_len field in the directory entry has to be a multiple of 4. A corrupted filesystem image can be used to hit a BUG() in ext4_rec_len_to_disk(), called from make_indexed_dir(). ------------[ cut here ]------------ kernel BUG at fs/ext4/ext4.h:2413! ... RIP: 0010:make_indexed_dir+0x53f/0x5f0 ... Call Trace: <TASK> ? add_dirent_to_buf+0x1b2/0x200 ext4_add_entry+0x36e/0x480 ext4_add_nondir+0x2b/0xc0 ext4_create+0x163/0x200 path_openat+0x635/0xe90 do_filp_open+0xb4/0x160 ? __create_object.isra.0+0x1de/0x3b0 ? _raw_spin_unlock+0x12/0x30 do_sys_openat2+0x91/0x150 __x64_sys_open+0x6c/0xa0 do_syscall_64+0x3c/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 The fix simply adds a call to ext4_check_dir_entry() to validate the directory entry, returning -EFSCORRUPTED if the entry is invalid. CC: stable@kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=216540 Signed-off-by: Luís Henriques <lhenriques@suse.de> Link: https://lore.kernel.org/r/20221012131330.32456-1-lhenriques@suse.de Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2022-10-28fs/ext4/super.c: remove unused `deprecated_msg'Andrew Morton1-4/+0
fs/ext4/super.c:1744:19: warning: 'deprecated_msg' defined but not used [-Wunused-const-variable=] Reported-by: kernel test robot <lkp@intel.com> Cc: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-10-28ext4: update the backup superblock's at the end of the online resizeTheodore Ts'o2-2/+6
When expanding a file system using online resize, various fields in the superblock (e.g., s_blocks_count, s_inodes_count, etc.) change. To update the backup superblocks, the online resize uses the function update_backups() in fs/ext4/resize.c. This function was not updating the checksum field in the backup superblocks. This wasn't a big deal previously, because e2fsck didn't care about the checksum field in the backup superblock. (And indeed, update_backups() goes all the way back to the ext3 days, well before we had support for metadata checksums.) However, there is an alternate, more general way of updating superblock fields, ext4_update_primary_sb() in fs/ext4/ioctl.c. This function does check the checksum of the backup superblock, and if it doesn't match will mark the file system as corrupted. That was clearly not the intent, so avoid to aborting the resize when a bad superblock is found. In addition, teach update_backups() to properly update the checksum in the backup superblocks. We will eventually want to unify updapte_backups() with the infrasture in ext4_update_primary_sb(), but that's for another day. Note: The problem has been around for a while; it just didn't really matter until ext4_update_primary_sb() was added by commit bbc605cdb1e1 ("ext4: implement support for get/set fs label"). And it became trivially easy to reproduce after commit 827891a38acc ("ext4: update the s_overhead_clusters in the backup sb's when resizing") in v6.0. Cc: stable@kernel.org # 5.17+ Fixes: bbc605cdb1e1 ("ext4: implement support for get/set fs label") Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2022-10-17Merge tag 'random-6.1-rc1-for-linus' of ↵Linus Torvalds4-11/+9
git://git.kernel.org/pub/scm/linux/kernel/git/crng/random Pull more random number generator updates from Jason Donenfeld: "This time with some large scale treewide cleanups. The intent of this pull is to clean up the way callers fetch random integers. The current rules for doing this right are: - If you want a secure or an insecure random u64, use get_random_u64() - If you want a secure or an insecure random u32, use get_random_u32() The old function prandom_u32() has been deprecated for a while now and is just a wrapper around get_random_u32(). Same for get_random_int(). - If you want a secure or an insecure random u16, use get_random_u16() - If you want a secure or an insecure random u8, use get_random_u8() - If you want secure or insecure random bytes, use get_random_bytes(). The old function prandom_bytes() has been deprecated for a while now and has long been a wrapper around get_random_bytes() - If you want a non-uniform random u32, u16, or u8 bounded by a certain open interval maximum, use prandom_u32_max() I say "non-uniform", because it doesn't do any rejection sampling or divisions. Hence, it stays within the prandom_*() namespace, not the get_random_*() namespace. I'm currently investigating a "uniform" function for 6.2. We'll see what comes of that. By applying these rules uniformly, we get several benefits: - By using prandom_u32_max() with an upper-bound that the compiler can prove at compile-time is ≤65536 or ≤256, internally get_random_u16() or get_random_u8() is used, which wastes fewer batched random bytes, and hence has higher throughput. - By using prandom_u32_max() instead of %, when the upper-bound is not a constant, division is still avoided, because prandom_u32_max() uses a faster multiplication-based trick instead. - By using get_random_u16() or get_random_u8() in cases where the return value is intended to indeed be a u16 or a u8, we waste fewer batched random bytes, and hence have higher throughput. This series was originally done by hand while I was on an airplane without Internet. Later, Kees and I worked on retroactively figuring out what could be done with Coccinelle and what had to be done manually, and then we split things up based on that. So while this touches a lot of files, the actual amount of code that's hand fiddled is comfortably small" * tag 'random-6.1-rc1-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/crng/random: prandom: remove unused functions treewide: use get_random_bytes() when possible treewide: use get_random_u32() when possible treewide: use get_random_{u8,u16}() when possible, part 2 treewide: use get_random_{u8,u16}() when possible, part 1 treewide: use prandom_u32_max() when possible, part 2 treewide: use prandom_u32_max() when possible, part 1
2022-10-14Merge tag 'mm-stable-2022-10-13' of ↵Linus Torvalds1-1/+2
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm Pull more MM updates from Andrew Morton: - fix a race which causes page refcounting errors in ZONE_DEVICE pages (Alistair Popple) - fix userfaultfd test harness instability (Peter Xu) - various other patches in MM, mainly fixes * tag 'mm-stable-2022-10-13' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (29 commits) highmem: fix kmap_to_page() for kmap_local_page() addresses mm/page_alloc: fix incorrect PGFREE and PGALLOC for high-order page mm/selftest: uffd: explain the write missing fault check mm/hugetlb: use hugetlb_pte_stable in migration race check mm/hugetlb: fix race condition of uffd missing/minor handling zram: always expose rw_page LoongArch: update local TLB if PTE entry exists mm: use update_mmu_tlb() on the second thread kasan: fix array-bounds warnings in tests hmm-tests: add test for migrate_device_range() nouveau/dmem: evict device private memory during release nouveau/dmem: refactor nouveau_dmem_fault_copy_one() mm/migrate_device.c: add migrate_device_range() mm/migrate_device.c: refactor migrate_vma and migrate_deivce_coherent_page() mm/memremap.c: take a pgmap reference on page allocation mm: free device private pages have zero refcount mm/memory.c: fix race when faulting a device private page mm/damon: use damon_sz_region() in appropriate place mm/damon: move sz_damon_region to damon_sz_region lib/test_meminit: add checks for the allocation functions ...
2022-10-13ext4,f2fs: fix readahead of verity dataMatthew Wilcox (Oracle)1-1/+2
The recent change of page_cache_ra_unbounded() arguments was buggy in the two callers, causing us to readahead the wrong pages. Move the definition of ractl down to after the index is set correctly. This affected performance on configurations that use fs-verity. Link: https://lkml.kernel.org/r/20221012193419.1453558-1-willy@infradead.org Fixes: 73bb49da50cd ("mm/readahead: make page_cache_ra_unbounded take a readahead_control") Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reported-by: Jintao Yin <nicememory@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-10-12treewide: use get_random_u32() when possibleJason A. Donenfeld3-4/+4
The prandom_u32() function has been a deprecated inline wrapper around get_random_u32() for several releases now, and compiles down to the exact same code. Replace the deprecated wrapper with a direct call to the real function. The same also applies to get_random_int(), which is just a wrapper around get_random_u32(). This was done as a basic find and replace. Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Reviewed-by: Kees Cook <keescook@chromium.org> Reviewed-by: Yury Norov <yury.norov@gmail.com> Reviewed-by: Jan Kara <jack@suse.cz> # for ext4 Acked-by: Toke Høiland-Jørgensen <toke@toke.dk> # for sch_cake Acked-by: Chuck Lever <chuck.lever@oracle.com> # for nfsd Acked-by: Jakub Kicinski <kuba@kernel.org> Acked-by: Mika Westerberg <mika.westerberg@linux.intel.com> # for thunderbolt Acked-by: Darrick J. Wong <djwong@kernel.org> # for xfs Acked-by: Helge Deller <deller@gmx.de> # for parisc Acked-by: Heiko Carstens <hca@linux.ibm.com> # for s390 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
2022-10-12treewide: use prandom_u32_max() when possible, part 2Jason A. Donenfeld1-3/+2
Rather than incurring a division or requesting too many random bytes for the given range, use the prandom_u32_max() function, which only takes the minimum required bytes from the RNG and avoids divisions. This was done by hand, covering things that coccinelle could not do on its own. Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Reviewed-by: Kees Cook <keescook@chromium.org> Reviewed-by: Yury Norov <yury.norov@gmail.com> Reviewed-by: Jan Kara <jack@suse.cz> # for ext2, ext4, and sbitmap Acked-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
2022-10-12treewide: use prandom_u32_max() when possible, part 1Jason A. Donenfeld1-4/+3
Rather than incurring a division or requesting too many random bytes for the given range, use the prandom_u32_max() function, which only takes the minimum required bytes from the RNG and avoids divisions. This was done mechanically with this coccinelle script: @basic@ expression E; type T; identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; typedef u64; @@ ( - ((T)get_random_u32() % (E)) + prandom_u32_max(E) | - ((T)get_random_u32() & ((E) - 1)) + prandom_u32_max(E * XXX_MAKE_SURE_E_IS_POW2) | - ((u64)(E) * get_random_u32() >> 32) + prandom_u32_max(E) | - ((T)get_random_u32() & ~PAGE_MASK) + prandom_u32_max(PAGE_SIZE) ) @multi_line@ identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; identifier RAND; expression E; @@ - RAND = get_random_u32(); ... when != RAND - RAND %= (E); + RAND = prandom_u32_max(E); // Find a potential literal @literal_mask@ expression LITERAL; type T; identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; position p; @@ ((T)get_random_u32()@p & (LITERAL)) // Add one to the literal. @script:python add_one@ literal << literal_mask.LITERAL; RESULT; @@ value = None if literal.startswith('0x'): value = int(literal, 16) elif literal[0] in '123456789': value = int(literal, 10) if value is None: print("I don't know how to handle %s" % (literal)) cocci.include_match(False) elif value == 2**32 - 1 or value == 2**31 - 1 or value == 2**24 - 1 or value == 2**16 - 1 or value == 2**8 - 1: print("Skipping 0x%x for cleanup elsewhere" % (value)) cocci.include_match(False) elif value & (value + 1) != 0: print("Skipping 0x%x because it's not a power of two minus one" % (value)) cocci.include_match(False) elif literal.startswith('0x'): coccinelle.RESULT = cocci.make_expr("0x%x" % (value + 1)) else: coccinelle.RESULT = cocci.make_expr("%d" % (value + 1)) // Replace the literal mask with the calculated result. @plus_one@ expression literal_mask.LITERAL; position literal_mask.p; expression add_one.RESULT; identifier FUNC; @@ - (FUNC()@p & (LITERAL)) + prandom_u32_max(RESULT) @collapse_ret@ type T; identifier VAR; expression E; @@ { - T VAR; - VAR = (E); - return VAR; + return E; } @drop_var@ type T; identifier VAR; @@ { - T VAR; ... when != VAR } Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Reviewed-by: Kees Cook <keescook@chromium.org> Reviewed-by: Yury Norov <yury.norov@gmail.com> Reviewed-by: KP Singh <kpsingh@kernel.org> Reviewed-by: Jan Kara <jack@suse.cz> # for ext4 and sbitmap Reviewed-by: Christoph Böhmwalder <christoph.boehmwalder@linbit.com> # for drbd Acked-by: Jakub Kicinski <kuba@kernel.org> Acked-by: Heiko Carstens <hca@linux.ibm.com> # for s390 Acked-by: Ulf Hansson <ulf.hansson@linaro.org> # for mmc Acked-by: Darrick J. Wong <djwong@kernel.org> # for xfs Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>