diff options
Diffstat (limited to 'fs')
-rw-r--r-- | fs/aio.c | 1 | ||||
-rw-r--r-- | fs/btrfs/extent_io.c | 7 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 2 | ||||
-rw-r--r-- | fs/buffer.c | 4 | ||||
-rw-r--r-- | fs/char_dev.c | 2 | ||||
-rw-r--r-- | fs/coredump.c | 1 | ||||
-rw-r--r-- | fs/dcache.c | 16 | ||||
-rw-r--r-- | fs/eventpoll.c | 11 | ||||
-rw-r--r-- | fs/ext4/page-io.c | 2 | ||||
-rw-r--r-- | fs/f2fs/data.c | 9 | ||||
-rw-r--r-- | fs/fcntl.c | 3 | ||||
-rw-r--r-- | fs/freevxfs/vxfs_dir.h | 2 | ||||
-rw-r--r-- | fs/fs-writeback.c | 40 | ||||
-rw-r--r-- | fs/fs_parser.c | 1 | ||||
-rw-r--r-- | fs/gfs2/export.c | 1 | ||||
-rw-r--r-- | fs/gfs2/file.c | 2 | ||||
-rw-r--r-- | fs/hfsplus/hfsplus_fs.h | 3 | ||||
-rw-r--r-- | fs/hfsplus/wrapper.c | 2 | ||||
-rw-r--r-- | fs/hugetlbfs/inode.c | 17 | ||||
-rw-r--r-- | fs/inode.c | 29 | ||||
-rw-r--r-- | fs/iomap/buffered-io.c | 2 | ||||
-rw-r--r-- | fs/lockd/svclock.c | 7 | ||||
-rw-r--r-- | fs/mpage.c | 2 | ||||
-rw-r--r-- | fs/namei.c | 41 | ||||
-rw-r--r-- | fs/namespace.c | 161 | ||||
-rw-r--r-- | fs/nfsd/nfs4state.c | 19 | ||||
-rw-r--r-- | fs/ocfs2/export.c | 1 | ||||
-rw-r--r-- | fs/ocfs2/file.c | 2 | ||||
-rw-r--r-- | fs/posix_acl.c | 13 | ||||
-rw-r--r-- | fs/seq_file.c | 2 |
30 files changed, 321 insertions, 84 deletions
@@ -2191,7 +2191,6 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, return -EINVAL; spin_lock_irq(&ctx->ctx_lock); - /* TODO: use a hash or array, this sucks. */ list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { if (kiocb->ki_res.obj == obj) { ret = kiocb->ki_cancel(&kiocb->rw); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 872cca54cc6c..42c9899d9241 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -786,7 +786,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, } if (bio_ctrl->wbc) - wbc_account_cgroup_owner(bio_ctrl->wbc, &folio->page, + wbc_account_cgroup_owner(bio_ctrl->wbc, folio, len); size -= len; @@ -1708,7 +1708,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, ret = bio_add_folio(&bbio->bio, folio, eb->len, eb->start - folio_pos(folio)); ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio_page(folio, 0), eb->len); + wbc_account_cgroup_owner(wbc, folio, eb->len); folio_unlock(folio); } else { int num_folios = num_extent_folios(eb); @@ -1722,8 +1722,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, folio_start_writeback(folio); ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio_page(folio, 0), - eb->folio_size); + wbc_account_cgroup_owner(wbc, folio, eb->folio_size); wbc->nr_to_write -= folio_nr_pages(folio); folio_unlock(folio); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1e4ca1e7d2e5..df20f2e7ac1e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1729,7 +1729,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, * need full accuracy. Just account the whole thing * against the first page. */ - wbc_account_cgroup_owner(wbc, &locked_folio->page, + wbc_account_cgroup_owner(wbc, locked_folio, cur_end - start); async_chunk[i].locked_folio = locked_folio; locked_folio = NULL; diff --git a/fs/buffer.c b/fs/buffer.c index 1fc9a50def0b..32bd0f4c4223 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2803,7 +2803,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_write_hint = write_hint; - __bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); + bio_add_folio_nofail(bio, bh->b_folio, bh->b_size, bh_offset(bh)); bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; @@ -2813,7 +2813,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, if (wbc) { wbc_init_bio(wbc, bio); - wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size); + wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size); } submit_bio(bio); diff --git a/fs/char_dev.c b/fs/char_dev.c index 57cc096c498a..c2ddb998f3c9 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -562,8 +562,8 @@ int cdev_device_add(struct cdev *cdev, struct device *dev) /** * cdev_device_del() - inverse of cdev_device_add - * @dev: the device structure * @cdev: the cdev structure + * @dev: the device structure * * cdev_device_del() is a helper function to call cdev_del and device_del. * It should be used whenever cdev_device_add is used. diff --git a/fs/coredump.c b/fs/coredump.c index 45737b43dda5..d48edb37bc35 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -951,6 +951,7 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, } else { dump_skip(cprm, PAGE_SIZE); } + cond_resched(); } dump_page_free(dump_page); return 1; diff --git a/fs/dcache.c b/fs/dcache.c index 0f6b16ba30d0..0099077a2982 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -135,6 +135,7 @@ struct dentry_stat_t { static DEFINE_PER_CPU(long, nr_dentry); static DEFINE_PER_CPU(long, nr_dentry_unused); static DEFINE_PER_CPU(long, nr_dentry_negative); +static int dentry_negative_policy; #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) /* Statistics gathering. */ @@ -199,6 +200,15 @@ static struct ctl_table fs_dcache_sysctls[] = { .mode = 0444, .proc_handler = proc_nr_dentry, }, + { + .procname = "dentry-negative", + .data = &dentry_negative_policy, + .maxlen = sizeof(dentry_negative_policy), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, }; static int __init init_fs_dcache_sysctls(void) @@ -2039,8 +2049,8 @@ EXPORT_SYMBOL(d_obtain_root); /** * d_add_ci - lookup or allocate new dentry with case-exact name - * @inode: the inode case-insensitive lookup has found * @dentry: the negative dentry that was passed to the parent's lookup func + * @inode: the inode case-insensitive lookup has found * @name: the case-exact name to be associated with the returned dentry * * This is to avoid filling the dcache with case-insensitive names to the @@ -2093,8 +2103,8 @@ EXPORT_SYMBOL(d_add_ci); /** * d_same_name - compare dentry name with case-exact name - * @parent: parent dentry * @dentry: the negative dentry that was passed to the parent's lookup func + * @parent: parent dentry * @name: the case-exact name to be associated with the returned dentry * * Return: true if names are same, or false @@ -2401,6 +2411,8 @@ void d_delete(struct dentry * dentry) * Are we the only user? */ if (dentry->d_lockref.count == 1) { + if (dentry_negative_policy) + __d_drop(dentry); dentry->d_flags &= ~DCACHE_CANT_MOUNT; dentry_unlink_inode(dentry); } else { diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 1ae4542f0bd8..1a06e462b6ef 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -823,7 +823,8 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) to_free = NULL; head = file->f_ep; if (head->first == &epi->fllink && !epi->fllink.next) { - file->f_ep = NULL; + /* See eventpoll_release() for details. */ + WRITE_ONCE(file->f_ep, NULL); if (!is_file_epoll(file)) { struct epitems_head *v; v = container_of(head, struct epitems_head, epitems); @@ -1372,7 +1373,10 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v break; } } - wake_up(&ep->wq); + if (sync) + wake_up_sync(&ep->wq); + else + wake_up(&ep->wq); } if (waitqueue_active(&ep->poll_wait)) pwake++; @@ -1603,7 +1607,8 @@ allocate: spin_unlock(&file->f_lock); goto allocate; } - file->f_ep = head; + /* See eventpoll_release() for details. */ + WRITE_ONCE(file->f_ep, head); to_free = NULL; } hlist_add_head_rcu(&epi->fllink, file->f_ep); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index ad5543866d21..b7b9261fec3b 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -421,7 +421,7 @@ submit_and_retry: io_submit_init_bio(io, bh); if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh))) goto submit_and_retry; - wbc_account_cgroup_owner(io->io_wbc, &folio->page, bh->b_size); + wbc_account_cgroup_owner(io->io_wbc, folio, bh->b_size); io->io_next_block++; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 94f7b084f601..e3ce763cce18 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -711,7 +711,8 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) } if (fio->io_wbc && !is_read_io(fio->op)) - wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), + PAGE_SIZE); inc_page_count(fio->sbi, is_read_io(fio->op) ? __read_io_type(page) : WB_DATA_TYPE(fio->page, false)); @@ -911,7 +912,8 @@ alloc_new: } if (fio->io_wbc) - wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), + PAGE_SIZE); inc_page_count(fio->sbi, WB_DATA_TYPE(page, false)); @@ -1011,7 +1013,8 @@ alloc_new: } if (fio->io_wbc) - wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), + PAGE_SIZE); io->last_block_in_bio = fio->new_blkaddr; diff --git a/fs/fcntl.c b/fs/fcntl.c index 22dd9dcce7ec..3d89de31066a 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -397,6 +397,9 @@ static long f_dupfd_query(int fd, struct file *filp) { CLASS(fd_raw, f)(fd); + if (fd_empty(f)) + return -EBADF; + /* * We can do the 'fdput()' immediately, as the only thing that * matters is the pointer value which isn't changed by the fdput. diff --git a/fs/freevxfs/vxfs_dir.h b/fs/freevxfs/vxfs_dir.h index fbcd603365ad..8c67627f2a3d 100644 --- a/fs/freevxfs/vxfs_dir.h +++ b/fs/freevxfs/vxfs_dir.h @@ -25,7 +25,7 @@ struct vxfs_dirblk { __fs16 d_free; /* free space in dirblock */ __fs16 d_nhash; /* no of hash chains */ - __fs16 d_hash[1]; /* hash chain */ + __fs16 d_hash[]; /* hash chain */ }; /* diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d8bec3c1bb1f..3cd99e2dc6ac 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -290,7 +290,6 @@ void __inode_attach_wb(struct inode *inode, struct folio *folio) if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) wb_put(wb); } -EXPORT_SYMBOL_GPL(__inode_attach_wb); /** * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list @@ -731,8 +730,9 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) * writeback completion, wbc_detach_inode() should be called. This is used * to track the cgroup writeback context. */ -void wbc_attach_and_unlock_inode(struct writeback_control *wbc, - struct inode *inode) +static void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) + __releases(&inode->i_lock) { if (!inode_cgwb_enabled(inode)) { spin_unlock(&inode->i_lock); @@ -762,7 +762,24 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css))) inode_switch_wbs(inode, wbc->wb_id); } -EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode); + +/** + * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite + * @wbc: writeback_control of interest + * @inode: target inode + * + * This function is to be used by __filemap_fdatawrite_range(), which is an + * alternative entry point into writeback code, and first ensures @inode is + * associated with a bdi_writeback and attaches it to @wbc. + */ +void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, + struct inode *inode) +{ + spin_lock(&inode->i_lock); + inode_attach_wb(inode, NULL); + wbc_attach_and_unlock_inode(wbc, inode); +} +EXPORT_SYMBOL_GPL(wbc_attach_fdatawrite_inode); /** * wbc_detach_inode - disassociate wbc from inode and perform foreign detection @@ -890,17 +907,16 @@ EXPORT_SYMBOL_GPL(wbc_detach_inode); /** * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership * @wbc: writeback_control of the writeback in progress - * @page: page being written out + * @folio: folio being written out * @bytes: number of bytes being written out * - * @bytes from @page are about to written out during the writeback + * @bytes from @folio are about to written out during the writeback * controlled by @wbc. Keep the book for foreign inode detection. See * wbc_detach_inode(). */ -void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, +void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio, size_t bytes) { - struct folio *folio; struct cgroup_subsys_state *css; int id; @@ -913,7 +929,6 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, if (!wbc->wb || wbc->no_cgroup_owner) return; - folio = page_folio(page); css = mem_cgroup_css_from_folio(folio); /* dead cgroups shouldn't contribute to inode ownership arbitration */ if (!(css->flags & CSS_ONLINE)) @@ -1227,6 +1242,13 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, } } +static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) + __releases(&inode->i_lock) +{ + spin_unlock(&inode->i_lock); +} + #endif /* CONFIG_CGROUP_WRITEBACK */ /* diff --git a/fs/fs_parser.c b/fs/fs_parser.c index 24727ec34e5a..698464f3e26a 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -156,6 +156,7 @@ int fs_lookup_param(struct fs_context *fc, f = getname_kernel(param->string); if (IS_ERR(f)) return PTR_ERR(f); + param->dirfd = AT_FDCWD; put_f = true; break; case fs_value_is_filename: diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index d418d8b5367f..3334c394ce9c 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -190,6 +190,5 @@ const struct export_operations gfs2_export_ops = { .fh_to_parent = gfs2_fh_to_parent, .get_name = gfs2_get_name, .get_parent = gfs2_get_parent, - .flags = EXPORT_OP_ASYNC_LOCK, }; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index f7dd64856c9b..1e73cf87ff88 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1586,6 +1586,7 @@ const struct file_operations gfs2_file_fops = { .splice_write = gfs2_file_splice_write, .setlease = simple_nosetlease, .fallocate = gfs2_fallocate, + .fop_flags = FOP_ASYNC_LOCK, }; const struct file_operations gfs2_dir_fops = { @@ -1598,6 +1599,7 @@ const struct file_operations gfs2_dir_fops = { .lock = gfs2_lock, .flock = gfs2_flock, .llseek = default_llseek, + .fop_flags = FOP_ASYNC_LOCK, }; #endif /* CONFIG_GFS2_FS_LOCKING_DLM */ diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index b733ef0c1547..2f089bff0095 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -157,6 +157,7 @@ struct hfsplus_sb_info { /* Runtime variables */ u32 blockoffset; + u32 min_io_size; sector_t part_start; sector_t sect_count; int fs_shift; @@ -308,7 +309,7 @@ struct hfsplus_readdir_data { */ static inline unsigned short hfsplus_min_io_size(struct super_block *sb) { - return max_t(unsigned short, bdev_logical_block_size(sb->s_bdev), + return max_t(unsigned short, HFSPLUS_SB(sb)->min_io_size, HFSPLUS_SECTOR_SIZE); } diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 9592ffcb44e5..74801911bc1c 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -172,6 +172,8 @@ int hfsplus_read_wrapper(struct super_block *sb) if (!blocksize) goto out; + sbi->min_io_size = blocksize; + if (hfsplus_get_last_session(sb, &part_start, &part_size)) goto out; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 5cf327337e22..2dea122e5b93 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -39,6 +39,9 @@ #include <linux/uaccess.h> #include <linux/sched/mm.h> +#define CREATE_TRACE_POINTS +#include <trace/events/hugetlbfs.h> + static const struct address_space_operations hugetlbfs_aops; static const struct file_operations hugetlbfs_file_operations; static const struct inode_operations hugetlbfs_dir_inode_operations; @@ -687,6 +690,7 @@ static void hugetlbfs_evict_inode(struct inode *inode) { struct resv_map *resv_map; + trace_hugetlbfs_evict_inode(inode); remove_inode_hugepages(inode, 0, LLONG_MAX); /* @@ -814,8 +818,10 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; - if (mode & FALLOC_FL_PUNCH_HOLE) - return hugetlbfs_punch_hole(inode, offset, len); + if (mode & FALLOC_FL_PUNCH_HOLE) { + error = hugetlbfs_punch_hole(inode, offset, len); + goto out_nolock; + } /* * Default preallocate case. @@ -919,6 +925,9 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, inode_set_ctime_current(inode); out: inode_unlock(inode); + +out_nolock: + trace_hugetlbfs_fallocate(inode, mode, offset, len, error); return error; } @@ -935,6 +944,8 @@ static int hugetlbfs_setattr(struct mnt_idmap *idmap, if (error) return error; + trace_hugetlbfs_setattr(inode, dentry, attr); + if (ia_valid & ATTR_SIZE) { loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; @@ -1033,6 +1044,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, break; } lockdep_annotate_inode_mutex_key(inode); + trace_hugetlbfs_alloc_inode(inode, dir, mode); } else { if (resv_map) kref_put(&resv_map->refs, resv_map_release); @@ -1272,6 +1284,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) static void hugetlbfs_free_inode(struct inode *inode) { + trace_hugetlbfs_free_inode(inode); kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); } diff --git a/fs/inode.c b/fs/inode.c index 70a2f8c717e0..b13b778257ae 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -819,7 +819,7 @@ static void evict(struct inode *inode) * ___wait_var_event() either sees the bit cleared or * waitqueue_active() check in wake_up_var() sees the waiter. */ - smp_mb(); + smp_mb__after_spinlock(); inode_wake_up_bit(inode, __I_NEW); BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); spin_unlock(&inode->i_lock); @@ -1312,16 +1312,15 @@ EXPORT_SYMBOL(unlock_two_nondirectories); * @data: opaque data pointer to pass to @test and @set * * Search for the inode specified by @hashval and @data in the inode cache, - * and if present it is return it with an increased reference count. This is - * a variant of iget5_locked() for callers that don't want to fail on memory - * allocation of inode. + * and if present return it with an increased reference count. This is a + * variant of iget5_locked() that doesn't allocate an inode. * - * If the inode is not in cache, insert the pre-allocated inode to cache and + * If the inode is not present in the cache, insert the pre-allocated inode and * return it locked, hashed, and with the I_NEW flag set. The file system gets * to fill it in before unlocking it via unlock_new_inode(). * - * Note both @test and @set are called with the inode_hash_lock held, so can't - * sleep. + * Note that both @test and @set are called with the inode_hash_lock held, so + * they can't sleep. */ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, int (*test)(struct inode *, void *), @@ -1385,16 +1384,16 @@ EXPORT_SYMBOL(inode_insert5); * @data: opaque data pointer to pass to @test and @set * * Search for the inode specified by @hashval and @data in the inode cache, - * and if present it is return it with an increased reference count. This is - * a generalized version of iget_locked() for file systems where the inode + * and if present return it with an increased reference count. This is a + * generalized version of iget_locked() for file systems where the inode * number is not sufficient for unique identification of an inode. * - * If the inode is not in cache, allocate a new inode and return it locked, - * hashed, and with the I_NEW flag set. The file system gets to fill it in - * before unlocking it via unlock_new_inode(). + * If the inode is not present in the cache, allocate and insert a new inode + * and return it locked, hashed, and with the I_NEW flag set. The file system + * gets to fill it in before unlocking it via unlock_new_inode(). * - * Note both @test and @set are called with the inode_hash_lock held, so can't - * sleep. + * Note that both @test and @set are called with the inode_hash_lock held, so + * they can't sleep. */ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), @@ -2912,7 +2911,7 @@ EXPORT_SYMBOL(inode_set_ctime_deleg); * @inode: inode to check * @vfsgid: the new/current vfsgid of @inode * - * Check wether @vfsgid is in the caller's group list or if the caller is + * Check whether @vfsgid is in the caller's group list or if the caller is * privileged with CAP_FSETID over @inode. This can be used to determine * whether the setgid bit can be kept or must be dropped. * diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index ef0b68bccbb6..ce73d2a48c1e 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1784,7 +1784,7 @@ new_ioend: if (ifs) atomic_add(len, &ifs->write_bytes_pending); wpc->ioend->io_size += len; - wbc_account_cgroup_owner(wbc, &folio->page, len); + wbc_account_cgroup_owner(wbc, folio, len); return 0; } diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 1f2149db10f2..2359347c9fbd 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -30,7 +30,6 @@ #include <linux/sunrpc/svc_xprt.h> #include <linux/lockd/nlm.h> #include <linux/lockd/lockd.h> -#include <linux/exportfs.h> #define NLMDBG_FACILITY NLMDBG_SVCLOCK @@ -481,7 +480,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_host *host, struct nlm_lock *lock, int wait, struct nlm_cookie *cookie, int reclaim) { - struct inode *inode = nlmsvc_file_inode(file); + struct inode *inode __maybe_unused = nlmsvc_file_inode(file); struct nlm_block *block = NULL; int error; int mode; @@ -496,7 +495,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, (long long)lock->fl.fl_end, wait); - if (!exportfs_lock_op_is_async(inode->i_sb->s_export_op)) { + if (!locks_can_async_lock(nlmsvc_file_file(file)->f_op)) { async_block = wait; wait = 0; } @@ -550,7 +549,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, * requests on the underlaying ->lock() implementation but * only one nlm_block to being granted by lm_grant(). */ - if (exportfs_lock_op_is_async(inode->i_sb->s_export_op) && + if (locks_can_async_lock(nlmsvc_file_file(file)->f_op) && !list_empty(&block->b_list)) { spin_unlock(&nlm_blocked_lock); ret = nlm_lck_blocked; diff --git a/fs/mpage.c b/fs/mpage.c index b5b5ddf9d513..82aecf372743 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -606,7 +606,7 @@ alloc_new: * the confused fail path above (OOM) will be very confused when * it finds all bh marked clean (i.e. it will not write anything) */ - wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio)); + wbc_account_cgroup_owner(wbc, folio, folio_size(folio)); length = first_unmapped << blkbits; if (!bio_add_folio(bio, folio, length, 0)) { bio = mpage_bio_submit_write(bio); diff --git a/fs/namei.c b/fs/namei.c index 4a4a22a08ac2..05a8d544fb35 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -326,6 +326,25 @@ static int check_acl(struct mnt_idmap *idmap, return -EAGAIN; } +/* + * Very quick optimistic "we know we have no ACL's" check. + * + * Note that this is purely for ACL_TYPE_ACCESS, and purely + * for the "we have cached that there are no ACLs" case. + * + * If this returns true, we know there are no ACLs. But if + * it returns false, we might still not have ACLs (it could + * be the is_uncached_acl() case). + */ +static inline bool no_acl_inode(struct inode *inode) +{ +#ifdef CONFIG_FS_POSIX_ACL + return likely(!READ_ONCE(inode->i_acl)); +#else + return true; +#endif +} + /** * acl_permission_check - perform basic UNIX permission checking * @idmap: idmap of the mount the inode was found from @@ -348,6 +367,28 @@ static int acl_permission_check(struct mnt_idmap *idmap, unsigned int mode = inode->i_mode; vfsuid_t vfsuid; + /* + * Common cheap case: everybody has the requested + * rights, and there are no ACLs to check. No need + * to do any owner/group checks in that case. + * + * - 'mask&7' is the requested permission bit set + * - multiplying by 0111 spreads them out to all of ugo + * - '& ~mode' looks for missing inode permission bits + * - the '!' is for "no missing permissions" + * + * After that, we just need to check that there are no + * ACL's on the inode - do the 'IS_POSIXACL()' check last + * because it will dereference the ->i_sb pointer and we + * want to avoid that if at all possible. + */ + if (!((mask & 7) * 0111 & ~mode)) { + if (no_acl_inode(inode)) + return 0; + if (!IS_POSIXACL(inode)) + return 0; + } + /* Are we the owner? If so, ACL's don't matter */ vfsuid = i_uid_into_vfsuid(idmap, inode); if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) { diff --git a/fs/namespace.c b/fs/namespace.c index d26f5e6d2ca3..206fc54feeba 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3901,7 +3901,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a } new_ns->ns.ops = &mntns_operations; if (!anon) - new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); + new_ns->seq = atomic64_inc_return(&mnt_ns_seq); refcount_set(&new_ns->ns.count, 1); refcount_set(&new_ns->passive, 1); new_ns->mounts = RB_ROOT; @@ -5006,6 +5006,40 @@ static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq) return 0; } +static void statmount_fs_subtype(struct kstatmount *s, struct seq_file *seq) +{ + struct super_block *sb = s->mnt->mnt_sb; + + if (sb->s_subtype) + seq_puts(seq, sb->s_subtype); +} + +static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq) +{ + struct super_block *sb = s->mnt->mnt_sb; + struct mount *r = real_mount(s->mnt); + + if (sb->s_op->show_devname) { + size_t start = seq->count; + int ret; + + ret = sb->s_op->show_devname(seq, s->mnt->mnt_root); + if (ret) + return ret; + + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; + + /* Unescape the result */ + seq->buf[seq->count] = '\0'; + seq->count = start; + seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL)); + } else if (r->mnt_devname) { + seq_puts(seq, r->mnt_devname); + } + return 0; +} + static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns) { s->sm.mask |= STATMOUNT_MNT_NS_ID; @@ -5040,35 +5074,134 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) return 0; } +static inline int statmount_opt_unescape(struct seq_file *seq, char *buf_start) +{ + char *buf_end, *opt_start, *opt_end; + int count = 0; + + buf_end = seq->buf + seq->count; + *buf_end = '\0'; + for (opt_start = buf_start + 1; opt_start < buf_end; opt_start = opt_end + 1) { + opt_end = strchrnul(opt_start, ','); + *opt_end = '\0'; + buf_start += string_unescape(opt_start, buf_start, 0, UNESCAPE_OCTAL) + 1; + if (WARN_ON_ONCE(++count == INT_MAX)) + return -EOVERFLOW; + } + seq->count = buf_start - 1 - seq->buf; + return count; +} + +static int statmount_opt_array(struct kstatmount *s, struct seq_file *seq) +{ + struct vfsmount *mnt = s->mnt; + struct super_block *sb = mnt->mnt_sb; + size_t start = seq->count; + char *buf_start; + int err; + + if (!sb->s_op->show_options) + return 0; + + buf_start = seq->buf + start; + err = sb->s_op->show_options(seq, mnt->mnt_root); + if (err) + return err; + + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; + + if (seq->count == start) + return 0; + + err = statmount_opt_unescape(seq, buf_start); + if (err < 0) + return err; + + s->sm.opt_num = err; + return 0; +} + +static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq) +{ + struct vfsmount *mnt = s->mnt; + struct super_block *sb = mnt->mnt_sb; + size_t start = seq->count; + char *buf_start; + int err; + + buf_start = seq->buf + start; + + err = security_sb_show_options(seq, sb); + if (!err) + return err; + + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; + + if (seq->count == start) + return 0; + + err = statmount_opt_unescape(seq, buf_start); + if (err < 0) + return err; + + s->sm.opt_sec_num = err; + return 0; +} + static int statmount_string(struct kstatmount *s, u64 flag) { - int ret; + int ret = 0; size_t kbufsize; struct seq_file *seq = &s->seq; struct statmount *sm = &s->sm; + u32 start = seq->count; switch (flag) { case STATMOUNT_FS_TYPE: - sm->fs_type = seq->count; + sm->fs_type = start; ret = statmount_fs_type(s, seq); break; case STATMOUNT_MNT_ROOT: - sm->mnt_root = seq->count; + sm->mnt_root = start; ret = statmount_mnt_root(s, seq); break; case STATMOUNT_MNT_POINT: - sm->mnt_point = seq->count; + sm->mnt_point = start; ret = statmount_mnt_point(s, seq); break; case STATMOUNT_MNT_OPTS: - sm->mnt_opts = seq->count; + sm->mnt_opts = start; ret = statmount_mnt_opts(s, seq); break; + case STATMOUNT_OPT_ARRAY: + sm->opt_array = start; + ret = statmount_opt_array(s, seq); + break; + case STATMOUNT_OPT_SEC_ARRAY: + sm->opt_sec_array = start; + ret = statmount_opt_sec_array(s, seq); + break; + case STATMOUNT_FS_SUBTYPE: + sm->fs_subtype = start; + statmount_fs_subtype(s, seq); + break; + case STATMOUNT_SB_SOURCE: + sm->sb_source = start; + ret = statmount_sb_source(s, seq); + break; default: WARN_ON_ONCE(true); return -EINVAL; } + /* + * If nothing was emitted, return to avoid setting the flag + * and terminating the buffer. + */ + if (seq->count == start) + return ret; if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize))) return -EOVERFLOW; if (kbufsize >= s->bufsize) @@ -5203,6 +5336,18 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (!err && s->mask & STATMOUNT_MNT_OPTS) err = statmount_string(s, STATMOUNT_MNT_OPTS); + if (!err && s->mask & STATMOUNT_OPT_ARRAY) + err = statmount_string(s, STATMOUNT_OPT_ARRAY); + + if (!err && s->mask & STATMOUNT_OPT_SEC_ARRAY) + err = statmount_string(s, STATMOUNT_OPT_SEC_ARRAY); + + if (!err && s->mask & STATMOUNT_FS_SUBTYPE) + err = statmount_string(s, STATMOUNT_FS_SUBTYPE); + + if (!err && s->mask & STATMOUNT_SB_SOURCE) + err = statmount_string(s, STATMOUNT_SB_SOURCE); + if (!err && s->mask & STATMOUNT_MNT_NS_ID) statmount_mnt_ns_id(s, ns); @@ -5224,7 +5369,9 @@ static inline bool retry_statmount(const long ret, size_t *seq_size) } #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \ - STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS) + STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \ + STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \ + STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY) static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, struct statmount __user *buf, size_t bufsize, diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 551d2958ec29..d80406f8b568 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -8001,9 +8001,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fp = lock_stp->st_stid.sc_file; switch (lock->lk_type) { case NFS4_READW_LT: - if (nfsd4_has_session(cstate) || - exportfs_lock_op_is_async(sb->s_export_op)) - flags |= FL_SLEEP; fallthrough; case NFS4_READ_LT: spin_lock(&fp->fi_lock); @@ -8014,9 +8011,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, type = F_RDLCK; break; case NFS4_WRITEW_LT: - if (nfsd4_has_session(cstate) || - exportfs_lock_op_is_async(sb->s_export_op)) - flags |= FL_SLEEP; fallthrough; case NFS4_WRITE_LT: spin_lock(&fp->fi_lock); @@ -8036,15 +8030,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - /* - * Most filesystems with their own ->lock operations will block - * the nfsd thread waiting to acquire the lock. That leads to - * deadlocks (we don't want every nfsd thread tied up waiting - * for file locks), so don't attempt blocking lock notifications - * on those filesystems: - */ - if (!exportfs_lock_op_is_async(sb->s_export_op)) - flags &= ~FL_SLEEP; + if (lock->lk_type & (NFS4_READW_LT | NFS4_WRITEW_LT) && + nfsd4_has_session(cstate) && + locks_can_async_lock(nf->nf_file->f_op)) + flags |= FL_SLEEP; nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn); if (!nbl) { diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 96b684763b39..b95724b767e1 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -280,5 +280,4 @@ const struct export_operations ocfs2_export_ops = { .fh_to_dentry = ocfs2_fh_to_dentry, .fh_to_parent = ocfs2_fh_to_parent, .get_parent = ocfs2_get_parent, - .flags = EXPORT_OP_ASYNC_LOCK, }; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 06af21982c16..4fa6c840d20b 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2812,6 +2812,7 @@ const struct file_operations ocfs2_fops = { .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, .remap_file_range = ocfs2_remap_file_range, + .fop_flags = FOP_ASYNC_LOCK, }; WRAP_DIR_ITER(ocfs2_readdir) // FIXME! @@ -2828,6 +2829,7 @@ const struct file_operations ocfs2_dops = { #endif .lock = ocfs2_lock, .flock = ocfs2_flock, + .fop_flags = FOP_ASYNC_LOCK, }; /* diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 6c66a37522d0..4050942ab52f 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -200,11 +200,11 @@ EXPORT_SYMBOL(posix_acl_init); * Allocate a new ACL with the specified number of entries. */ struct posix_acl * -posix_acl_alloc(int count, gfp_t flags) +posix_acl_alloc(unsigned int count, gfp_t flags) { - const size_t size = sizeof(struct posix_acl) + - count * sizeof(struct posix_acl_entry); - struct posix_acl *acl = kmalloc(size, flags); + struct posix_acl *acl; + + acl = kmalloc(struct_size(acl, a_entries, count), flags); if (acl) posix_acl_init(acl, count); return acl; @@ -220,9 +220,8 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags) struct posix_acl *clone = NULL; if (acl) { - int size = sizeof(struct posix_acl) + acl->a_count * - sizeof(struct posix_acl_entry); - clone = kmemdup(acl, size, flags); + clone = kmemdup(acl, struct_size(acl, a_entries, acl->a_count), + flags); if (clone) refcount_set(&clone->a_refcount, 1); } diff --git a/fs/seq_file.c b/fs/seq_file.c index e676c8b0cf5d..8bbb1ad46335 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -343,8 +343,8 @@ EXPORT_SYMBOL(seq_lseek); /** * seq_release - free the structures associated with sequential file. - * @file: file in question * @inode: its inode + * @file: file in question * * Frees the structures associated with sequential file; can be used * as ->f_op->release() if you don't have private data to destroy. |