From ddd65e19c60140673ea9f7249af0a672f1820623 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 23 Mar 2024 17:11:19 +0100 Subject: block: handle BLK_OPEN_RESTRICT_WRITES correctly Last kernel release we introduce CONFIG_BLK_DEV_WRITE_MOUNTED. By default this option is set. When it is set the long-standing behavior of being able to write to mounted block devices is enabled. But in order to guard against unintended corruption by writing to the block device buffer cache CONFIG_BLK_DEV_WRITE_MOUNTED can be turned off. In that case it isn't possible to write to mounted block devices anymore. A filesystem may open its block devices with BLK_OPEN_RESTRICT_WRITES which disallows concurrent BLK_OPEN_WRITE access. When we still had the bdev handle around we could recognize BLK_OPEN_RESTRICT_WRITES because the mode was passed around. Since we managed to get rid of the bdev handle we changed that logic to recognize BLK_OPEN_RESTRICT_WRITES based on whether the file was opened writable and writes to that block device are blocked. That logic doesn't work because we do allow BLK_OPEN_RESTRICT_WRITES to be specified without BLK_OPEN_WRITE. Fix the detection logic and use an FMODE_* bit. We could've also abused O_EXCL as an indicator that BLK_OPEN_RESTRICT_WRITES has been requested. For userspace open paths O_EXCL will never be retained but for internal opens where we open files that are never installed into a file descriptor table this is fine. But it would be a gamble that this doesn't cause bugs. Note that BLK_OPEN_RESTRICT_WRITES is an internal only flag that cannot directly be raised by userspace. It is implicitly raised during mounting. Passes xftests and blktests with CONFIG_BLK_DEV_WRITE_MOUNTED set and unset. Link: https://lore.kernel.org/r/ZfyyEwu9Uq5Pgb94@casper.infradead.org Link: https://lore.kernel.org/r/20240323-zielbereich-mittragen-6fdf14876c3e@brauner Fixes: 321de651fa56 ("block: don't rely on BLK_OPEN_RESTRICT_WRITES when yielding write access") Reviewed-by: Yu Kuai Reviewed-by: Jan Kara Reported-by: Matthew Wilcox Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 00fc429b0af0..8dfd53b52744 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -121,6 +121,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_PWRITE ((__force fmode_t)0x10) /* File is opened for execution with sys_execve / sys_uselib */ #define FMODE_EXEC ((__force fmode_t)0x20) +/* File writes are restricted (block device specific) */ +#define FMODE_WRITE_RESTRICTED ((__force fmode_t)0x40) /* 32bit hashes as llseek() offset (for directories) */ #define FMODE_32BITHASH ((__force fmode_t)0x200) /* 64bit hashes as llseek() offset (for directories) */ -- cgit v1.2.3 From 07a3b8d0bf726a1e49b050bbc6bd72f031e505fe Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 17 Mar 2024 20:41:51 +0200 Subject: fsnotify: lazy attach fsnotify_sb_info state to sb Define a container struct fsnotify_sb_info to hold per-sb state, including the reference to sb marks connector. Allocate the fsnotify_sb_info state before attaching connector to any object on the sb and free it only when killing sb. This state is going to be used for storing per priority watched objects counters. Suggested-by: Jan Kara Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Message-Id: <20240317184154.1200192-8-amir73il@gmail.com> --- fs/notify/fsnotify.c | 16 +++++++++++++--- fs/notify/fsnotify.h | 9 ++++++++- fs/notify/mark.c | 32 +++++++++++++++++++++++++++++++- include/linux/fs.h | 8 ++++---- include/linux/fsnotify_backend.h | 17 +++++++++++++++++ 5 files changed, 73 insertions(+), 9 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 503e7c75e777..fb3f36bc6ea9 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -89,11 +89,18 @@ static void fsnotify_unmount_inodes(struct super_block *sb) void fsnotify_sb_delete(struct super_block *sb) { + struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); + + /* Were any marks ever added to any object on this sb? */ + if (!sbinfo) + return; + fsnotify_unmount_inodes(sb); fsnotify_clear_marks_by_sb(sb); /* Wait for outstanding object references from connectors */ wait_var_event(fsnotify_sb_watched_objects(sb), !atomic_long_read(fsnotify_sb_watched_objects(sb))); + kfree(sbinfo); } /* @@ -489,6 +496,7 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, { const struct path *path = fsnotify_data_path(data, data_type); struct super_block *sb = fsnotify_data_sb(data, data_type); + struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); struct fsnotify_iter_info iter_info = {}; struct mount *mnt = NULL; struct inode *inode2 = NULL; @@ -525,7 +533,7 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, * SRCU because we have no references to any objects and do not * need SRCU to keep them "alive". */ - if (!sb->s_fsnotify_marks && + if ((!sbinfo || !sbinfo->sb_marks) && (!mnt || !mnt->mnt_fsnotify_marks) && (!inode || !inode->i_fsnotify_marks) && (!inode2 || !inode2->i_fsnotify_marks)) @@ -552,8 +560,10 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); - iter_info.marks[FSNOTIFY_ITER_TYPE_SB] = - fsnotify_first_mark(&sb->s_fsnotify_marks); + if (sbinfo) { + iter_info.marks[FSNOTIFY_ITER_TYPE_SB] = + fsnotify_first_mark(&sbinfo->sb_marks); + } if (mnt) { iter_info.marks[FSNOTIFY_ITER_TYPE_VFSMOUNT] = fsnotify_first_mark(&mnt->mnt_fsnotify_marks); diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index b1fec7cd8ee4..2d059f789ee3 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -54,6 +54,13 @@ static inline struct super_block *fsnotify_connector_sb( return fsnotify_object_sb(conn->obj, conn->type); } +static inline fsnotify_connp_t *fsnotify_sb_marks(struct super_block *sb) +{ + struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); + + return sbinfo ? &sbinfo->sb_marks : NULL; +} + /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct fsnotify_group *group); @@ -79,7 +86,7 @@ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) /* run the list of all marks associated with sb and destroy them */ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb) { - fsnotify_destroy_marks(&sb->s_fsnotify_marks); + fsnotify_destroy_marks(fsnotify_sb_marks(sb)); } /* diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 647c49c20467..b2f5d8c9cce1 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -106,7 +106,7 @@ static fsnotify_connp_t *fsnotify_object_connp(void *obj, case FSNOTIFY_OBJ_TYPE_VFSMOUNT: return &real_mount(obj)->mnt_fsnotify_marks; case FSNOTIFY_OBJ_TYPE_SB: - return &((struct super_block *)obj)->s_fsnotify_marks; + return fsnotify_sb_marks(obj); default: return NULL; } @@ -569,6 +569,26 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) return -1; } +static int fsnotify_attach_info_to_sb(struct super_block *sb) +{ + struct fsnotify_sb_info *sbinfo; + + /* sb info is freed on fsnotify_sb_delete() */ + sbinfo = kzalloc(sizeof(*sbinfo), GFP_KERNEL); + if (!sbinfo) + return -ENOMEM; + + /* + * cmpxchg() provides the barrier so that callers of fsnotify_sb_info() + * will observe an initialized structure + */ + if (cmpxchg(&sb->s_fsnotify_info, NULL, sbinfo)) { + /* Someone else created sbinfo for us */ + kfree(sbinfo); + } + return 0; +} + static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, void *obj, unsigned int obj_type) { @@ -640,6 +660,16 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj, if (WARN_ON(!fsnotify_valid_obj_type(obj_type))) return -EINVAL; + /* + * Attach the sb info before attaching a connector to any object on sb. + * The sb info will remain attached as long as sb lives. + */ + if (!fsnotify_sb_info(sb)) { + err = fsnotify_attach_info_to_sb(sb); + if (err) + return err; + } + connp = fsnotify_object_connp(obj, obj_type); restart: spin_lock(&mark->lock); diff --git a/include/linux/fs.h b/include/linux/fs.h index 00fc429b0af0..7f40b592f711 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -73,6 +73,8 @@ struct fscrypt_inode_info; struct fscrypt_operations; struct fsverity_info; struct fsverity_operations; +struct fsnotify_mark_connector; +struct fsnotify_sb_info; struct fs_context; struct fs_parameter_spec; struct fileattr; @@ -620,8 +622,6 @@ is_uncached_acl(struct posix_acl *acl) #define IOP_XATTR 0x0008 #define IOP_DEFAULT_READLINK 0x0010 -struct fsnotify_mark_connector; - /* * Keep mostly read-only and often accessed (especially for * the RCU path lookup and 'stat' data) fields at the beginning @@ -1249,7 +1249,7 @@ struct super_block { /* * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and - * s_fsnotify_marks together for cache efficiency. They are frequently + * s_fsnotify_info together for cache efficiency. They are frequently * accessed and rarely modified. */ void *s_fs_info; /* Filesystem private info */ @@ -1261,7 +1261,7 @@ struct super_block { time64_t s_time_max; #ifdef CONFIG_FSNOTIFY __u32 s_fsnotify_mask; - struct fsnotify_mark_connector __rcu *s_fsnotify_marks; + struct fsnotify_sb_info *s_fsnotify_info; #endif /* diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 83004d9e07a3..c9f2b2f6b493 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -477,6 +477,23 @@ struct fsnotify_mark_connector { struct hlist_head list; }; +/* + * Container for per-sb fsnotify state (sb marks and more). + * Attached lazily on first marked object on the sb and freed when killing sb. + */ +struct fsnotify_sb_info { + struct fsnotify_mark_connector __rcu *sb_marks; +}; + +static inline struct fsnotify_sb_info *fsnotify_sb_info(struct super_block *sb) +{ +#ifdef CONFIG_FSNOTIFY + return READ_ONCE(sb->s_fsnotify_info); +#else + return NULL; +#endif +} + static inline atomic_long_t *fsnotify_sb_watched_objects(struct super_block *sb) { return &sb->s_fsnotify_connectors; -- cgit v1.2.3 From cb5d4f48c10445c97a22af0bd8b9cf0ed6cc8036 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 17 Mar 2024 20:41:52 +0200 Subject: fsnotify: move s_fsnotify_connectors into fsnotify_sb_info Move the s_fsnotify_connectors counter into the per-sb fsnotify state. Suggested-by: Christian Brauner Signed-off-by: Amir Goldstein Reviewed-by: Christian Brauner Signed-off-by: Jan Kara Message-Id: <20240317184154.1200192-9-amir73il@gmail.com> --- include/linux/fs.h | 6 ------ include/linux/fsnotify.h | 8 +++++++- include/linux/fsnotify_backend.h | 7 ++++++- 3 files changed, 13 insertions(+), 8 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7f40b592f711..c36c2f8fdbe3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1302,12 +1302,6 @@ struct super_block { /* Number of inodes with nlink == 0 but still referenced */ atomic_long_t s_remove_count; - /* - * Number of inode/mount/sb objects that are being watched, note that - * inodes objects are currently double-accounted. - */ - atomic_long_t s_fsnotify_connectors; - /* Read-only state of the superblock is being changed */ int s_readonly_remount; diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index e470bb67c9a3..48dc65702415 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -20,7 +20,13 @@ /* Are there any inode/mount/sb objects that are being watched at all? */ static inline bool fsnotify_sb_has_watchers(struct super_block *sb) { - return atomic_long_read(fsnotify_sb_watched_objects(sb)); + struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); + + /* Were any marks ever added to any object on this sb? */ + if (!sbinfo) + return false; + + return atomic_long_read(&sbinfo->watched_objects); } /* diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index c9f2b2f6b493..ec592aeadfa3 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -483,6 +483,11 @@ struct fsnotify_mark_connector { */ struct fsnotify_sb_info { struct fsnotify_mark_connector __rcu *sb_marks; + /* + * Number of inode/mount/sb objects that are being watched in this sb. + * Note that inodes objects are currently double-accounted. + */ + atomic_long_t watched_objects; }; static inline struct fsnotify_sb_info *fsnotify_sb_info(struct super_block *sb) @@ -496,7 +501,7 @@ static inline struct fsnotify_sb_info *fsnotify_sb_info(struct super_block *sb) static inline atomic_long_t *fsnotify_sb_watched_objects(struct super_block *sb) { - return &sb->s_fsnotify_connectors; + return &fsnotify_sb_info(sb)->watched_objects; } /* -- cgit v1.2.3 From 68d6f4f3fbd9b1baae53e7cf33fb3362b5a21494 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 25 Mar 2024 19:34:01 -0600 Subject: fs: Annotate struct file_handle with __counted_by() and use struct_size() Prepare for the coming implementation by GCC and Clang of the __counted_by attribute. Flexible array members annotated with __counted_by can have their accesses bounds-checked at run-time via CONFIG_UBSAN_BOUNDS (for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family functions). While there, use struct_size() helper, instead of the open-coded version. [brauner@kernel.org: contains a fix by Edward for an OOB access] Reported-by: syzbot+4139435cb1b34cf759c2@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Link: https://lore.kernel.org/r/tencent_A7845DD769577306D813742365E976E3A205@qq.com Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/ZgImCXTdGDTeBvSS@neat Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fhandle.c | 6 +++--- include/linux/fs.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/fhandle.c b/fs/fhandle.c index 57a12614addf..8a7f86c2139a 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -36,7 +36,7 @@ static long do_sys_name_to_handle(const struct path *path, if (f_handle.handle_bytes > MAX_HANDLE_SZ) return -EINVAL; - handle = kzalloc(sizeof(struct file_handle) + f_handle.handle_bytes, + handle = kzalloc(struct_size(handle, f_handle, f_handle.handle_bytes), GFP_KERNEL); if (!handle) return -ENOMEM; @@ -71,7 +71,7 @@ static long do_sys_name_to_handle(const struct path *path, /* copy the mount id */ if (put_user(real_mount(path->mnt)->mnt_id, mnt_id) || copy_to_user(ufh, handle, - sizeof(struct file_handle) + handle_bytes)) + struct_size(handle, f_handle, handle_bytes))) retval = -EFAULT; kfree(handle); return retval; @@ -192,7 +192,7 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, retval = -EINVAL; goto out_err; } - handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes, + handle = kmalloc(struct_size(handle, f_handle, f_handle.handle_bytes), GFP_KERNEL); if (!handle) { retval = -ENOMEM; diff --git a/include/linux/fs.h b/include/linux/fs.h index 00fc429b0af0..914245085597 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1033,7 +1033,7 @@ struct file_handle { __u32 handle_bytes; int handle_type; /* file identifier */ - unsigned char f_handle[]; + unsigned char f_handle[] __counted_by(handle_bytes); }; static inline struct file *get_file(struct file *f) -- cgit v1.2.3 From 210a03c9d51aa0e6e6f06980116e3256da8d4c48 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 28 Mar 2024 13:27:24 +0100 Subject: fs: claw back a few FMODE_* bits There's a bunch of flags that are purely based on what the file operations support while also never being conditionally set or unset. IOW, they're not subject to change for individual files. Imho, such flags don't need to live in f_mode they might as well live in the fops structs itself. And the fops struct already has that lonely mmap_supported_flags member. We might as well turn that into a generic fop_flags member and move a few flags from FMODE_* space into FOP_* space. That gets us four FMODE_* bits back and the ability for new static flags that are about file ops to not have to live in FMODE_* space but in their own FOP_* space. It's not the most beautiful thing ever but it gets the job done. Yes, there'll be an additional pointer chase but hopefully that won't matter for these flags. I suspect there's a few more we can move into there and that we can also redirect a bunch of new flag suggestions that follow this pattern into the fop_flags field instead of f_mode. Link: https://lore.kernel.org/r/20240328-gewendet-spargel-aa60a030ef74@brauner Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- block/bdev.c | 2 +- block/fops.c | 1 + drivers/dax/device.c | 2 +- fs/btrfs/file.c | 4 ++-- fs/ext4/file.c | 6 +++--- fs/f2fs/file.c | 3 ++- fs/read_write.c | 2 +- fs/xfs/xfs_file.c | 8 +++++--- include/linux/fs.h | 22 ++++++++++++---------- io_uring/io_uring.c | 2 +- io_uring/rw.c | 9 +++++---- mm/mmap.c | 4 +++- 12 files changed, 37 insertions(+), 28 deletions(-) (limited to 'include/linux/fs.h') diff --git a/block/bdev.c b/block/bdev.c index 7a5f611c3d2e..1322dfe32c5d 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -904,7 +904,7 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, disk_unblock_events(disk); bdev_file->f_flags |= O_LARGEFILE; - bdev_file->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; + bdev_file->f_mode |= FMODE_CAN_ODIRECT; if (bdev_nowait(bdev)) bdev_file->f_mode |= FMODE_NOWAIT; bdev_file->f_mapping = bdev->bd_inode->i_mapping; diff --git a/block/fops.c b/block/fops.c index 679d9b752fe8..af6c244314af 100644 --- a/block/fops.c +++ b/block/fops.c @@ -863,6 +863,7 @@ const struct file_operations def_blk_fops = { .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .fallocate = blkdev_fallocate, + .fop_flags = FOP_BUFFER_RASYNC, }; static __init int blkdev_init(void) diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 93ebedc5ec8c..c24ef4d3cf31 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -377,7 +377,7 @@ static const struct file_operations dax_fops = { .release = dax_release, .get_unmapped_area = dax_get_unmapped_area, .mmap = dax_mmap, - .mmap_supported_flags = MAP_SYNC, + .fop_flags = FOP_MMAP_SYNC, }; static void dev_dax_cdev_del(void *cdev) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f9d76072398d..1640c46f2153 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3719,8 +3719,7 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) { int ret; - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC | - FMODE_CAN_ODIRECT; + filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; ret = fsverity_file_open(inode, filp); if (ret) @@ -3850,6 +3849,7 @@ const struct file_operations btrfs_file_operations = { .compat_ioctl = btrfs_compat_ioctl, #endif .remap_file_range = btrfs_remap_file_range, + .fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC, }; int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 54d6ff22585c..28c51b0cc4db 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -885,8 +885,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp) return ret; } - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | - FMODE_DIO_PARALLEL_WRITE; + filp->f_mode |= FMODE_NOWAIT; return dquot_file_open(inode, filp); } @@ -938,7 +937,6 @@ const struct file_operations ext4_file_operations = { .compat_ioctl = ext4_compat_ioctl, #endif .mmap = ext4_file_mmap, - .mmap_supported_flags = MAP_SYNC, .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, @@ -946,6 +944,8 @@ const struct file_operations ext4_file_operations = { .splice_read = ext4_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, + .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | + FOP_DIO_PARALLEL_WRITE, }; const struct inode_operations ext4_file_inode_operations = { diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 1761ad125f97..2b65e09822d4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -569,7 +569,7 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) if (err) return err; - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + filp->f_mode |= FMODE_NOWAIT; filp->f_mode |= FMODE_CAN_ODIRECT; return dquot_file_open(inode, filp); @@ -5045,4 +5045,5 @@ const struct file_operations f2fs_file_operations = { .splice_read = f2fs_file_splice_read, .splice_write = iter_file_splice_write, .fadvise = f2fs_file_fadvise, + .fop_flags = FOP_BUFFER_RASYNC, }; diff --git a/fs/read_write.c b/fs/read_write.c index d4c036e82b6c..2115d1f40bd5 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1685,7 +1685,7 @@ int generic_write_checks_count(struct kiocb *iocb, loff_t *count) if ((iocb->ki_flags & IOCB_NOWAIT) && !((iocb->ki_flags & IOCB_DIRECT) || - (file->f_mode & FMODE_BUF_WASYNC))) + (file->f_op->fop_flags & FOP_BUFFER_WASYNC))) return -EINVAL; return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 632653e00906..147439ad3581 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1230,8 +1230,7 @@ xfs_file_open( { if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; - file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC | - FMODE_DIO_PARALLEL_WRITE | FMODE_CAN_ODIRECT; + file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; return generic_file_open(inode, file); } @@ -1490,7 +1489,6 @@ const struct file_operations xfs_file_operations = { .compat_ioctl = xfs_file_compat_ioctl, #endif .mmap = xfs_file_mmap, - .mmap_supported_flags = MAP_SYNC, .open = xfs_file_open, .release = xfs_file_release, .fsync = xfs_file_fsync, @@ -1498,6 +1496,8 @@ const struct file_operations xfs_file_operations = { .fallocate = xfs_file_fallocate, .fadvise = xfs_file_fadvise, .remap_file_range = xfs_file_remap_range, + .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC | + FOP_DIO_PARALLEL_WRITE, }; const struct file_operations xfs_dir_file_operations = { @@ -1510,4 +1510,6 @@ const struct file_operations xfs_dir_file_operations = { .compat_ioctl = xfs_file_compat_ioctl, #endif .fsync = xfs_dir_fsync, + .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC | + FOP_DIO_PARALLEL_WRITE, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 914245085597..53cddf0131ca 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -163,9 +163,6 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_NOREUSE ((__force fmode_t)0x800000) -/* File supports non-exclusive O_DIRECT writes from multiple threads */ -#define FMODE_DIO_PARALLEL_WRITE ((__force fmode_t)0x1000000) - /* File is embedded in backing_file object */ #define FMODE_BACKING ((__force fmode_t)0x2000000) @@ -181,12 +178,6 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File does not contribute to nr_files count */ #define FMODE_NOACCOUNT ((__force fmode_t)0x20000000) -/* File supports async buffered reads */ -#define FMODE_BUF_RASYNC ((__force fmode_t)0x40000000) - -/* File supports async nowait buffered writes */ -#define FMODE_BUF_WASYNC ((__force fmode_t)0x80000000) - /* * Attribute flags. These should be or-ed together to figure out what * has been changed! @@ -2001,8 +1992,11 @@ struct iov_iter; struct io_uring_cmd; struct offset_ctx; +typedef unsigned int __bitwise fop_flags_t; + struct file_operations { struct module *owner; + fop_flags_t fop_flags; loff_t (*llseek) (struct file *, loff_t, int); ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); @@ -2015,7 +2009,6 @@ struct file_operations { long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); - unsigned long mmap_supported_flags; int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); @@ -2046,6 +2039,15 @@ struct file_operations { unsigned int poll_flags); } __randomize_layout; +/* Supports async buffered reads */ +#define FOP_BUFFER_RASYNC ((__force fop_flags_t)(1 << 0)) +/* Supports async buffered writes */ +#define FOP_BUFFER_WASYNC ((__force fop_flags_t)(1 << 1)) +/* Supports synchronous page faults for mappings */ +#define FOP_MMAP_SYNC ((__force fop_flags_t)(1 << 2)) +/* Supports non-exclusive O_DIRECT writes from multiple threads */ +#define FOP_DIO_PARALLEL_WRITE ((__force fop_flags_t)(1 << 3)) + /* Wrap a directory iterator that needs exclusive inode access */ int wrap_directory_iterator(struct file *, struct dir_context *, int (*) (struct file *, struct dir_context *)); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 5d4b448fdc50..d73c9ad2d2f8 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -471,7 +471,7 @@ static void io_prep_async_work(struct io_kiocb *req) /* don't serialize this request if the fs doesn't need it */ if (should_hash && (req->file->f_flags & O_DIRECT) && - (req->file->f_mode & FMODE_DIO_PARALLEL_WRITE)) + (req->file->f_op->fop_flags & FOP_DIO_PARALLEL_WRITE)) should_hash = false; if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); diff --git a/io_uring/rw.c b/io_uring/rw.c index 0585ebcc9773..d9dfde1142a1 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -683,7 +683,8 @@ static bool io_rw_should_retry(struct io_kiocb *req) * just use poll if we can, and don't attempt if the fs doesn't * support callback based unlocks */ - if (io_file_can_poll(req) || !(req->file->f_mode & FMODE_BUF_RASYNC)) + if (io_file_can_poll(req) || + !(req->file->f_op->fop_flags & FOP_BUFFER_RASYNC)) return false; wait->wait.func = io_async_buf_func; @@ -1022,10 +1023,10 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!io_file_supports_nowait(req))) goto copy_iov; - /* File path supports NOWAIT for non-direct_IO only for block devices. */ + /* Check if we can support NOWAIT. */ if (!(kiocb->ki_flags & IOCB_DIRECT) && - !(kiocb->ki_filp->f_mode & FMODE_BUF_WASYNC) && - (req->flags & REQ_F_ISREG)) + !(req->file->f_op->fop_flags & FOP_BUFFER_WASYNC) && + (req->flags & REQ_F_ISREG)) goto copy_iov; kiocb->ki_flags |= IOCB_NOWAIT; diff --git a/mm/mmap.c b/mm/mmap.c index 6dbda99a47da..3490af70f259 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1294,7 +1294,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (!file_mmap_ok(file, inode, pgoff, len)) return -EOVERFLOW; - flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags; + flags_mask = LEGACY_MAP_MASK; + if (file->f_op->fop_flags & FOP_MMAP_SYNC) + flags_mask |= MAP_SYNC; switch (flags & MAP_TYPE) { case MAP_SHARED: -- cgit v1.2.3 From 0a4f544d83990e4b7326020a802d64956fa366de Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 7 Apr 2024 15:16:55 +0200 Subject: fs: use bit shifts for FMODE_* flags Make it easier to see what bits are still available. Link: https://lore.kernel.org/r/20240406061604.GA538574@ZenIV Signed-off-by: Christian Brauner --- include/linux/fs.h | 57 +++++++++++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 26 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 53cddf0131ca..2ba98930c4c4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -110,21 +110,24 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, */ /* file is open for reading */ -#define FMODE_READ ((__force fmode_t)0x1) +#define FMODE_READ ((__force fmode_t)(1 << 0)) /* file is open for writing */ -#define FMODE_WRITE ((__force fmode_t)0x2) +#define FMODE_WRITE ((__force fmode_t)(1 << 1)) /* file is seekable */ -#define FMODE_LSEEK ((__force fmode_t)0x4) +#define FMODE_LSEEK ((__force fmode_t)(1 << 2)) /* file can be accessed using pread */ -#define FMODE_PREAD ((__force fmode_t)0x8) +#define FMODE_PREAD ((__force fmode_t)(1 << 3)) /* file can be accessed using pwrite */ -#define FMODE_PWRITE ((__force fmode_t)0x10) +#define FMODE_PWRITE ((__force fmode_t)(1 << 4)) /* File is opened for execution with sys_execve / sys_uselib */ -#define FMODE_EXEC ((__force fmode_t)0x20) +#define FMODE_EXEC ((__force fmode_t)(1 << 5)) + +/* FMODE_* bits 6 to 8 */ + /* 32bit hashes as llseek() offset (for directories) */ -#define FMODE_32BITHASH ((__force fmode_t)0x200) +#define FMODE_32BITHASH ((__force fmode_t)(1 << 9)) /* 64bit hashes as llseek() offset (for directories) */ -#define FMODE_64BITHASH ((__force fmode_t)0x400) +#define FMODE_64BITHASH ((__force fmode_t)(1 << 10)) /* * Don't update ctime and mtime. @@ -132,51 +135,53 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, * Currently a special hack for the XFS open_by_handle ioctl, but we'll * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon. */ -#define FMODE_NOCMTIME ((__force fmode_t)0x800) +#define FMODE_NOCMTIME ((__force fmode_t)(1 << 11)) /* Expect random access pattern */ -#define FMODE_RANDOM ((__force fmode_t)0x1000) +#define FMODE_RANDOM ((__force fmode_t)(1 << 12)) /* File is huge (eg. /dev/mem): treat loff_t as unsigned */ -#define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000) +#define FMODE_UNSIGNED_OFFSET ((__force fmode_t)(1 << 13)) /* File is opened with O_PATH; almost nothing can be done with it */ -#define FMODE_PATH ((__force fmode_t)0x4000) +#define FMODE_PATH ((__force fmode_t)(1 << 14)) /* File needs atomic accesses to f_pos */ -#define FMODE_ATOMIC_POS ((__force fmode_t)0x8000) +#define FMODE_ATOMIC_POS ((__force fmode_t)(1 << 15)) /* Write access to underlying fs */ -#define FMODE_WRITER ((__force fmode_t)0x10000) +#define FMODE_WRITER ((__force fmode_t)(1 << 16)) /* Has read method(s) */ -#define FMODE_CAN_READ ((__force fmode_t)0x20000) +#define FMODE_CAN_READ ((__force fmode_t)(1 << 17)) /* Has write method(s) */ -#define FMODE_CAN_WRITE ((__force fmode_t)0x40000) +#define FMODE_CAN_WRITE ((__force fmode_t)(1 << 18)) -#define FMODE_OPENED ((__force fmode_t)0x80000) -#define FMODE_CREATED ((__force fmode_t)0x100000) +#define FMODE_OPENED ((__force fmode_t)(1 << 19)) +#define FMODE_CREATED ((__force fmode_t)(1 << 20)) /* File is stream-like */ -#define FMODE_STREAM ((__force fmode_t)0x200000) +#define FMODE_STREAM ((__force fmode_t)(1 << 21)) /* File supports DIRECT IO */ -#define FMODE_CAN_ODIRECT ((__force fmode_t)0x400000) +#define FMODE_CAN_ODIRECT ((__force fmode_t)(1 << 22)) + +#define FMODE_NOREUSE ((__force fmode_t)(1 << 23)) -#define FMODE_NOREUSE ((__force fmode_t)0x800000) +/* FMODE_* bit 24 */ /* File is embedded in backing_file object */ -#define FMODE_BACKING ((__force fmode_t)0x2000000) +#define FMODE_BACKING ((__force fmode_t)(1 << 25)) /* File was opened by fanotify and shouldn't generate fanotify events */ -#define FMODE_NONOTIFY ((__force fmode_t)0x4000000) +#define FMODE_NONOTIFY ((__force fmode_t)(1 << 26)) /* File is capable of returning -EAGAIN if I/O will block */ -#define FMODE_NOWAIT ((__force fmode_t)0x8000000) +#define FMODE_NOWAIT ((__force fmode_t)(1 << 27)) /* File represents mount that needs unmounting */ -#define FMODE_NEED_UNMOUNT ((__force fmode_t)0x10000000) +#define FMODE_NEED_UNMOUNT ((__force fmode_t)(1 << 28)) /* File does not contribute to nr_files count */ -#define FMODE_NOACCOUNT ((__force fmode_t)0x20000000) +#define FMODE_NOACCOUNT ((__force fmode_t)(1 << 29)) /* * Attribute flags. These should be or-ed together to figure out what -- cgit v1.2.3 From 886b94d25a8eba4c42634dddc3cbfd6391a24d25 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 9 Apr 2024 10:41:33 +0200 Subject: fs: Add FOP_HUGE_PAGES Instead of checking for specific file_operations, add a bit to file_operations which denotes a file that only contain hugetlb pages. This lets us make hugetlbfs_file_operations static, and removes is_file_shm_hugepages() completely. Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20240407201122.3783877-1-willy@infradead.org Signed-off-by: Christian Brauner --- fs/hugetlbfs/inode.c | 5 +++-- include/linux/fs.h | 2 ++ include/linux/hugetlb.h | 8 ++------ include/linux/shm.h | 5 ----- ipc/shm.c | 10 +++------- 5 files changed, 10 insertions(+), 20 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 6502c7e776d1..34ac73cc36b1 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -40,7 +40,7 @@ #include static const struct address_space_operations hugetlbfs_aops; -const struct file_operations hugetlbfs_file_operations; +static const struct file_operations hugetlbfs_file_operations; static const struct inode_operations hugetlbfs_dir_inode_operations; static const struct inode_operations hugetlbfs_inode_operations; @@ -1301,13 +1301,14 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -const struct file_operations hugetlbfs_file_operations = { +static const struct file_operations hugetlbfs_file_operations = { .read_iter = hugetlbfs_read_iter, .mmap = hugetlbfs_file_mmap, .fsync = noop_fsync, .get_unmapped_area = hugetlb_get_unmapped_area, .llseek = default_llseek, .fallocate = hugetlbfs_fallocate, + .fop_flags = FOP_HUGE_PAGES, }; static const struct inode_operations hugetlbfs_dir_inode_operations = { diff --git a/include/linux/fs.h b/include/linux/fs.h index 2ba98930c4c4..b053f98393e6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2052,6 +2052,8 @@ struct file_operations { #define FOP_MMAP_SYNC ((__force fop_flags_t)(1 << 2)) /* Supports non-exclusive O_DIRECT writes from multiple threads */ #define FOP_DIO_PARALLEL_WRITE ((__force fop_flags_t)(1 << 3)) +/* Contains huge pages */ +#define FOP_HUGE_PAGES ((__force fop_flags_t)(1 << 4)) /* Wrap a directory iterator that needs exclusive inode access */ int wrap_directory_iterator(struct file *, struct dir_context *, diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 77b30a8c6076..b06f7c426d38 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -554,17 +554,13 @@ static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); } -extern const struct file_operations hugetlbfs_file_operations; extern const struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, int creat_flags, int page_size_log); -static inline bool is_file_hugepages(struct file *file) +static inline bool is_file_hugepages(const struct file *file) { - if (file->f_op == &hugetlbfs_file_operations) - return true; - - return is_file_shm_hugepages(file); + return file->f_op->fop_flags & FOP_HUGE_PAGES; } static inline struct hstate *hstate_inode(struct inode *i) diff --git a/include/linux/shm.h b/include/linux/shm.h index c55bef0538e5..1d3d3ae958fb 100644 --- a/include/linux/shm.h +++ b/include/linux/shm.h @@ -16,7 +16,6 @@ struct sysv_shm { long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr, unsigned long shmlba); -bool is_file_shm_hugepages(struct file *file); void exit_shm(struct task_struct *task); #define shm_init_task(task) INIT_LIST_HEAD(&(task)->sysvshm.shm_clist) #else @@ -30,10 +29,6 @@ static inline long do_shmat(int shmid, char __user *shmaddr, { return -ENOSYS; } -static inline bool is_file_shm_hugepages(struct file *file) -{ - return false; -} static inline void exit_shm(struct task_struct *task) { } diff --git a/ipc/shm.c b/ipc/shm.c index a89f001a8bf0..3e3071252dac 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -662,8 +662,8 @@ static const struct file_operations shm_file_operations = { }; /* - * shm_file_operations_huge is now identical to shm_file_operations, - * but we keep it distinct for the sake of is_file_shm_hugepages(). + * shm_file_operations_huge is now identical to shm_file_operations + * except for fop_flags */ static const struct file_operations shm_file_operations_huge = { .mmap = shm_mmap, @@ -672,13 +672,9 @@ static const struct file_operations shm_file_operations_huge = { .get_unmapped_area = shm_get_unmapped_area, .llseek = noop_llseek, .fallocate = shm_fallocate, + .fop_flags = FOP_HUGE_PAGES, }; -bool is_file_shm_hugepages(struct file *file) -{ - return file->f_op == &shm_file_operations_huge; -} - static const struct vm_operations_struct shm_vm_ops = { .open = shm_open, /* callback for a new vm-area open */ .close = shm_close, /* callback for when the vm-area is released */ -- cgit v1.2.3 From 1f65e57dc5417b166843438bef31c70b9a5208fe Mon Sep 17 00:00:00 2001 From: Stefan Berger Date: Fri, 23 Feb 2024 12:25:12 -0500 Subject: fs: Rename SB_I_EVM_UNSUPPORTED to SB_I_EVM_HMAC_UNSUPPORTED Now that EVM supports RSA signatures for previously completely unsupported filesystems rename the flag SB_I_EVM_UNSUPPORTED to SB_I_EVM_HMAC_UNSUPPORTED to reflect that only HMAC is not supported. Suggested-by: Amir Goldstein Suggested-by: Mimi Zohar Signed-off-by: Stefan Berger Acked-by: Amir Goldstein Signed-off-by: Mimi Zohar --- fs/overlayfs/super.c | 2 +- include/linux/fs.h | 2 +- security/integrity/evm/evm_main.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index a40fc7e05525..06a231970cb5 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1460,7 +1460,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) * lead to unexpected results. */ sb->s_iflags |= SB_I_NOUMASK; - sb->s_iflags |= SB_I_EVM_UNSUPPORTED; + sb->s_iflags |= SB_I_EVM_HMAC_UNSUPPORTED; err = -ENOMEM; root_dentry = ovl_get_root(sb, ctx->upper.dentry, oe); diff --git a/include/linux/fs.h b/include/linux/fs.h index 8dfd53b52744..7bbd5e2d2a20 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1177,7 +1177,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ #define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 #define SB_I_UNTRUSTED_MOUNTER 0x00000040 -#define SB_I_EVM_UNSUPPORTED 0x00000080 +#define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080 #define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ #define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c index cfb4f9809369..c4a6081ce596 100644 --- a/security/integrity/evm/evm_main.c +++ b/security/integrity/evm/evm_main.c @@ -155,7 +155,7 @@ static int is_unsupported_fs(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); - if (inode->i_sb->s_iflags & SB_I_EVM_UNSUPPORTED) { + if (inode->i_sb->s_iflags & SB_I_EVM_HMAC_UNSUPPORTED) { pr_info_once("%s not supported\n", inode->i_sb->s_type->name); return 1; } -- cgit v1.2.3 From e964fc77577a9afe528e54b50527cf49e24aa211 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 11 Apr 2024 20:51:07 +0300 Subject: vfs, swap: compile out IS_SWAPFILE() on swapless configs No swap support -- no swapfiles possible. Signed-off-by: Alexey Dobriyan (Yandex) Link: https://lore.kernel.org/r/2391c7f5-0f83-4188-ae56-4ec7ccbf2576@p183 Signed-off-by: Christian Brauner --- include/linux/fs.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index b053f98393e6..d376dd7057d5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2260,7 +2260,13 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) #define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME) + +#ifdef CONFIG_SWAP #define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE) +#else +#define IS_SWAPFILE(inode) ((void)(inode), 0U) +#endif + #define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE) #define IS_IMA(inode) ((inode)->i_flags & S_IMA) #define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) -- cgit v1.2.3 From af58dc1f50c1946018773beca23ebaad587b9cc9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 20 Jan 2024 06:24:55 -0500 Subject: kernel_file_open(): get rid of inode argument always equal to ->dentry->d_inode of the path argument these days. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/cachefiles/namei.c | 3 +-- fs/open.c | 5 ++--- fs/overlayfs/util.c | 2 +- include/linux/fs.h | 2 +- 4 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 7ade836beb58..f53977169db4 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -563,8 +563,7 @@ static bool cachefiles_open_file(struct cachefiles_object *object, */ path.mnt = cache->mnt; path.dentry = dentry; - file = kernel_file_open(&path, O_RDWR | O_LARGEFILE | O_DIRECT, - d_backing_inode(dentry), cache->cache_cred); + file = kernel_file_open(&path, O_RDWR | O_LARGEFILE | O_DIRECT, cache->cache_cred); if (IS_ERR(file)) { trace_cachefiles_vfs_error(object, d_backing_inode(dentry), PTR_ERR(file), diff --git a/fs/open.c b/fs/open.c index ee8460c83c77..ec287ac67e7f 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1155,7 +1155,6 @@ EXPORT_SYMBOL(dentry_create); * kernel_file_open - open a file for kernel internal use * @path: path of the file to open * @flags: open flags - * @inode: the inode * @cred: credentials for open * * Open a file for use by in-kernel consumers. The file is not accounted @@ -1165,7 +1164,7 @@ EXPORT_SYMBOL(dentry_create); * Return: Opened file on success, an error pointer on failure. */ struct file *kernel_file_open(const struct path *path, int flags, - struct inode *inode, const struct cred *cred) + const struct cred *cred) { struct file *f; int error; @@ -1175,7 +1174,7 @@ struct file *kernel_file_open(const struct path *path, int flags, return f; f->f_path = *path; - error = do_dentry_open(f, inode, NULL); + error = do_dentry_open(f, d_inode(path->dentry), NULL); if (error) { fput(f); f = ERR_PTR(error); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index d285d1d7baad..edc9216f6e27 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -1376,7 +1376,7 @@ int ovl_ensure_verity_loaded(struct path *datapath) * If this inode was not yet opened, the verity info hasn't been * loaded yet, so we need to do that here to force it into memory. */ - filp = kernel_file_open(datapath, O_RDONLY, inode, current_cred()); + filp = kernel_file_open(datapath, O_RDONLY, current_cred()); if (IS_ERR(filp)) return PTR_ERR(filp); fput(filp); diff --git a/include/linux/fs.h b/include/linux/fs.h index 00fc429b0af0..143e967a3af2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1906,7 +1906,7 @@ struct file *kernel_tmpfile_open(struct mnt_idmap *idmap, umode_t mode, int open_flag, const struct cred *cred); struct file *kernel_file_open(const struct path *path, int flags, - struct inode *inode, const struct cred *cred); + const struct cred *cred); int vfs_mkobj(struct dentry *, umode_t, int (*f)(struct dentry *, umode_t, void *), -- cgit v1.2.3 From 7c98f7cb8fda964fbc60b9307ad35e94735fa35f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 28 Aug 2023 17:13:18 +0200 Subject: remove call_{read,write}_iter() functions These have no clear purpose. This is effectively a revert of commit bb7462b6fd64 ("vfs: use helpers for calling f_op->{read,write}_iter()"). The patch was created with the help of a coccinelle script. Fixes: bb7462b6fd64 ("vfs: use helpers for calling f_op->{read,write}_iter()") Reviewed-by: Christian Brauner Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- drivers/block/loop.c | 4 ++-- drivers/target/target_core_file.c | 4 ++-- fs/aio.c | 4 ++-- fs/read_write.c | 12 ++++++------ fs/splice.c | 4 ++-- include/linux/fs.h | 12 ------------ io_uring/rw.c | 4 ++-- 7 files changed, 16 insertions(+), 28 deletions(-) (limited to 'include/linux/fs.h') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 28a95fd366fe..93780f41646b 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -445,9 +445,9 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); if (rw == ITER_SOURCE) - ret = call_write_iter(file, &cmd->iocb, &iter); + ret = file->f_op->write_iter(&cmd->iocb, &iter); else - ret = call_read_iter(file, &cmd->iocb, &iter); + ret = file->f_op->read_iter(&cmd->iocb, &iter); lo_rw_aio_do_completion(cmd); diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index 4d447520bab8..94e6cd4e7e43 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -299,9 +299,9 @@ fd_execute_rw_aio(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, aio_cmd->iocb.ki_flags |= IOCB_DSYNC; if (is_write) - ret = call_write_iter(file, &aio_cmd->iocb, &iter); + ret = file->f_op->write_iter(&aio_cmd->iocb, &iter); else - ret = call_read_iter(file, &aio_cmd->iocb, &iter); + ret = file->f_op->read_iter(&aio_cmd->iocb, &iter); if (ret != -EIOCBQUEUED) cmd_rw_aio_complete(&aio_cmd->iocb, ret); diff --git a/fs/aio.c b/fs/aio.c index 9cdaa2faa536..13ca81c7c744 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1605,7 +1605,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb, return ret; ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) - aio_rw_done(req, call_read_iter(file, req, &iter)); + aio_rw_done(req, file->f_op->read_iter(req, &iter)); kfree(iovec); return ret; } @@ -1636,7 +1636,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, if (S_ISREG(file_inode(file)->i_mode)) kiocb_start_write(req); req->ki_flags |= IOCB_WRITE; - aio_rw_done(req, call_write_iter(file, req, &iter)); + aio_rw_done(req, file->f_op->write_iter(req, &iter)); } kfree(iovec); return ret; diff --git a/fs/read_write.c b/fs/read_write.c index d4c036e82b6c..2de7f6adb33d 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -392,7 +392,7 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_ubuf(&iter, ITER_DEST, buf, len); - ret = call_read_iter(filp, &kiocb, &iter); + ret = filp->f_op->read_iter(&kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; @@ -494,7 +494,7 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len); - ret = call_write_iter(filp, &kiocb, &iter); + ret = filp->f_op->write_iter(&kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ret > 0 && ppos) *ppos = kiocb.ki_pos; @@ -736,9 +736,9 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, kiocb.ki_pos = (ppos ? *ppos : 0); if (type == READ) - ret = call_read_iter(filp, &kiocb, iter); + ret = filp->f_op->read_iter(&kiocb, iter); else - ret = call_write_iter(filp, &kiocb, iter); + ret = filp->f_op->write_iter(&kiocb, iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; @@ -799,7 +799,7 @@ ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, if (ret < 0) return ret; - ret = call_read_iter(file, iocb, iter); + ret = file->f_op->read_iter(iocb, iter); out: if (ret >= 0) fsnotify_access(file); @@ -860,7 +860,7 @@ ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, return ret; kiocb_start_write(iocb); - ret = call_write_iter(file, iocb, iter); + ret = file->f_op->write_iter(iocb, iter); if (ret != -EIOCBQUEUED) kiocb_end_write(iocb); if (ret > 0) diff --git a/fs/splice.c b/fs/splice.c index 218e24b1ac40..60aed8de21f8 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -362,7 +362,7 @@ ssize_t copy_splice_read(struct file *in, loff_t *ppos, iov_iter_bvec(&to, ITER_DEST, bv, npages, len); init_sync_kiocb(&kiocb, in); kiocb.ki_pos = *ppos; - ret = call_read_iter(in, &kiocb, &to); + ret = in->f_op->read_iter(&kiocb, &to); if (ret > 0) { keep = DIV_ROUND_UP(ret, PAGE_SIZE); @@ -740,7 +740,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left); init_sync_kiocb(&kiocb, out); kiocb.ki_pos = sd.pos; - ret = call_write_iter(out, &kiocb, &from); + ret = out->f_op->write_iter(&kiocb, &from); sd.pos = kiocb.ki_pos; if (ret <= 0) break; diff --git a/include/linux/fs.h b/include/linux/fs.h index 143e967a3af2..a94343f567cb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2096,18 +2096,6 @@ struct inode_operations { struct offset_ctx *(*get_offset_ctx)(struct inode *inode); } ____cacheline_aligned; -static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio, - struct iov_iter *iter) -{ - return file->f_op->read_iter(kio, iter); -} - -static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio, - struct iov_iter *iter) -{ - return file->f_op->write_iter(kio, iter); -} - static inline int call_mmap(struct file *file, struct vm_area_struct *vma) { return file->f_op->mmap(file, vma); diff --git a/io_uring/rw.c b/io_uring/rw.c index 0585ebcc9773..7d335b7e00ed 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -701,7 +701,7 @@ static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter) struct file *file = rw->kiocb.ki_filp; if (likely(file->f_op->read_iter)) - return call_read_iter(file, &rw->kiocb, iter); + return file->f_op->read_iter(&rw->kiocb, iter); else if (file->f_op->read) return loop_rw_iter(READ, rw, iter); else @@ -1047,7 +1047,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) kiocb->ki_flags |= IOCB_WRITE; if (likely(req->file->f_op->write_iter)) - ret2 = call_write_iter(req->file, kiocb, &s->iter); + ret2 = req->file->f_op->write_iter(kiocb, &s->iter); else if (req->file->f_op->write) ret2 = loop_rw_iter(WRITE, rw, &s->iter); else -- cgit v1.2.3 From 5b9932f6001c70b984e8c9c2fe09e443beb4baba Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 15 Apr 2024 14:54:13 -0700 Subject: vfs: export remap and write check helpers Export these functions so that the next patch can use them to check the file ranges being passed to the XFS_IOC_EXCHANGE_RANGE operation. Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/read_write.c | 1 + fs/remap_range.c | 4 ++-- include/linux/fs.h | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/read_write.c b/fs/read_write.c index d4c036e82b6c..85c096f2c0d0 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1667,6 +1667,7 @@ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count) return 0; } +EXPORT_SYMBOL_GPL(generic_write_check_limits); /* Like generic_write_checks(), but takes size of write instead of iter. */ int generic_write_checks_count(struct kiocb *iocb, loff_t *count) diff --git a/fs/remap_range.c b/fs/remap_range.c index de07f978ce3e..28246dfc8485 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -99,8 +99,7 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in, return 0; } -static int remap_verify_area(struct file *file, loff_t pos, loff_t len, - bool write) +int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write) { int mask = write ? MAY_WRITE : MAY_READ; loff_t tmp; @@ -118,6 +117,7 @@ static int remap_verify_area(struct file *file, loff_t pos, loff_t len, return fsnotify_file_area_perm(file, mask, &pos, len); } +EXPORT_SYMBOL_GPL(remap_verify_area); /* * Ensure that we don't remap a partial EOF block in the middle of something diff --git a/include/linux/fs.h b/include/linux/fs.h index 8dfd53b52744..0835faeebe7b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2119,6 +2119,7 @@ extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, loff_t, size_t, unsigned int); +int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write); int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags, -- cgit v1.2.3 From 5a1a25be995e1014abd01600479915683e356f5c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 15 Apr 2024 11:20:55 -0400 Subject: libfs: Add simple_offset_rename() API I'm about to fix a tmpfs rename bug that requires the use of internal simple_offset helpers that are not available in mm/shmem.c Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/20240415152057.4605-3-cel@kernel.org Signed-off-by: Christian Brauner --- fs/libfs.c | 21 +++++++++++++++++++++ include/linux/fs.h | 2 ++ mm/shmem.c | 3 +-- 3 files changed, 24 insertions(+), 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/libfs.c b/fs/libfs.c index ab61fae92cde..c392a6edd393 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -357,6 +357,27 @@ int simple_offset_empty(struct dentry *dentry) return ret; } +/** + * simple_offset_rename - handle directory offsets for rename + * @old_dir: parent directory of source entry + * @old_dentry: dentry of source entry + * @new_dir: parent_directory of destination entry + * @new_dentry: dentry of destination + * + * Caller provides appropriate serialization. + * + * Returns zero on success, a negative errno value on failure. + */ +int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir); + struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir); + + simple_offset_remove(old_ctx, old_dentry); + return simple_offset_add(new_ctx, old_dentry); +} + /** * simple_offset_rename_exchange - exchange rename with directory offsets * @old_dir: parent of dentry being moved diff --git a/include/linux/fs.h b/include/linux/fs.h index 00fc429b0af0..82c707e40fea 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3338,6 +3338,8 @@ void simple_offset_init(struct offset_ctx *octx); int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry); void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry); int simple_offset_empty(struct dentry *dentry); +int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry); int simple_offset_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, diff --git a/mm/shmem.c b/mm/shmem.c index 0aad0d9a621b..c0fb65223963 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3473,8 +3473,7 @@ static int shmem_rename2(struct mnt_idmap *idmap, return error; } - simple_offset_remove(shmem_get_offset_ctx(old_dir), old_dentry); - error = simple_offset_add(shmem_get_offset_ctx(new_dir), old_dentry); + error = simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry); if (error) return error; -- cgit v1.2.3 From a5674119f0faa46f6ebdfcfa92342535b7f54e67 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Mar 2024 09:36:27 -0700 Subject: fs: convert alloc_inode_sb() to a macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We're introducing alloc tagging, which tracks memory allocations by callsite. Converting alloc_inode_sb() to a macro means allocations will be tracked by its caller, which is a bit more useful. Link: https://lkml.kernel.org/r/20240321163705.3067592-6-surenb@google.com Signed-off-by: Kent Overstreet Signed-off-by: Suren Baghdasaryan Cc: Alexander Viro Reviewed-by: Kees Cook Reviewed-by: Pasha Tatashin Tested-by: Kees Cook Cc: Alex Gaynor Cc: Alice Ryhl Cc: Andreas Hindborg Cc: Benno Lossin Cc: "Björn Roy Baron" Cc: Boqun Feng Cc: Christoph Lameter Cc: Dennis Zhou Cc: Gary Guo Cc: Miguel Ojeda Cc: Peter Zijlstra Cc: Tejun Heo Cc: Vlastimil Babka Cc: Wedson Almeida Filho Signed-off-by: Andrew Morton --- include/linux/fs.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 8dfd53b52744..4f2e2e5b6ab1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3085,11 +3085,7 @@ int setattr_should_drop_sgid(struct mnt_idmap *idmap, * This must be used for allocating filesystems specific inodes to set * up the inode reclaim context correctly. */ -static inline void * -alloc_inode_sb(struct super_block *sb, struct kmem_cache *cache, gfp_t gfp) -{ - return kmem_cache_alloc_lru(cache, &sb->s_inode_lru, gfp); -} +#define alloc_inode_sb(_sb, _cache, _gfp) kmem_cache_alloc_lru(_cache, &_sb->s_inode_lru, _gfp) extern void __insert_inode_hash(struct inode *, unsigned long hashval); static inline void insert_inode_hash(struct inode *inode) -- cgit v1.2.3 From f6bdc7865ef4a7a4393c90a550c728231ebc853e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 3 May 2024 13:16:25 -0700 Subject: fs: WARN when f_count resurrection is attempted It should never happen that get_file() is called on a file with f_count equal to zero. If this happens, a use-after-free condition has happened[1], and we need to attempt a best-effort reporting of the situation to help find the root cause more easily. Additionally, this serves as a data corruption indicator that system owners using warn_limit or panic_on_warn would like to have detected. Link: https://lore.kernel.org/lkml/7c41cf3c-2a71-4dbb-8f34-0337890906fc@gmail.com/ [1] Suggested-by: Peter Zijlstra Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20240503201620.work.651-kees@kernel.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index e5f643e97cfd..5b351c1e6f58 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1034,7 +1034,8 @@ struct file_handle { static inline struct file *get_file(struct file *f) { - atomic_long_inc(&f->f_count); + long prior = atomic_long_fetch_inc_relaxed(&f->f_count); + WARN_ONCE(!prior, "struct file::f_count incremented from zero; use-after-free condition present!\n"); return f; } -- cgit v1.2.3