Diffstat (limited to 'fs/fuse')
-rw-r--r--  fs/fuse/Kconfig         3
-rw-r--r--  fs/fuse/Makefile        5
-rw-r--r--  fs/fuse/backing.c     179
-rw-r--r--  fs/fuse/control.c      58
-rw-r--r--  fs/fuse/cuse.c          3
-rw-r--r--  fs/fuse/dax.c           3
-rw-r--r--  fs/fuse/dev.c         242
-rw-r--r--  fs/fuse/dev_uring.c    35
-rw-r--r--  fs/fuse/dir.c         297
-rw-r--r--  fs/fuse/file.c        703
-rw-r--r--  fs/fuse/fuse_dev_i.h   14
-rw-r--r--  fs/fuse/fuse_i.h      115
-rw-r--r--  fs/fuse/inode.c       144
-rw-r--r--  fs/fuse/ioctl.c         4
-rw-r--r--  fs/fuse/iomode.c        3
-rw-r--r--  fs/fuse/passthrough.c 162
-rw-r--r--  fs/fuse/trace.c        13
-rw-r--r--  fs/fuse/virtio_fs.c    25
18 files changed, 1214 insertions, 794 deletions
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index ca215a3cba3e..3a4ae632c94a 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -2,6 +2,7 @@
 config FUSE_FS
 	tristate "FUSE (Filesystem in Userspace) support"
 	select FS_POSIX_ACL
+	select FS_IOMAP
 	help
 	  With FUSE it is possible to implement a fully functional filesystem
 	  in a userspace program.
@@ -12,7 +13,7 @@ config FUSE_FS
 	  although chances are your distribution already has that library
 	  installed if you've installed the "fuse" package itself.

-	  See <file:Documentation/filesystems/fuse.rst> for more information.
+	  See <file:Documentation/filesystems/fuse/fuse.rst> for more information.
 	  See <file:Documentation/Changes> for needed library/utility version.

 	  If you want to develop a userspace FS, or if you want to use
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 3f0f312a31c1..22ad9538dfc4 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -10,10 +10,11 @@ obj-$(CONFIG_FUSE_FS) += fuse.o
 obj-$(CONFIG_CUSE) += cuse.o
 obj-$(CONFIG_VIRTIO_FS) += virtiofs.o

-fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o
+fuse-y := trace.o # put trace.o first so we see ftrace errors sooner
+fuse-y += dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o
 fuse-y += iomode.o
 fuse-$(CONFIG_FUSE_DAX) += dax.o
-fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o
+fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o backing.o
 fuse-$(CONFIG_SYSCTL) += sysctl.o
 fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o
diff --git a/fs/fuse/backing.c b/fs/fuse/backing.c
new file mode 100644
index 000000000000..4afda419dd14
--- /dev/null
+++ b/fs/fuse/backing.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FUSE passthrough to backing file.
+ *
+ * Copyright (c) 2023 CTERA Networks.
+ */
+
+#include "fuse_i.h"
+
+#include <linux/file.h>
+
+struct fuse_backing *fuse_backing_get(struct fuse_backing *fb)
+{
+	if (fb && refcount_inc_not_zero(&fb->count))
+		return fb;
+	return NULL;
+}
+
+static void fuse_backing_free(struct fuse_backing *fb)
+{
+	pr_debug("%s: fb=0x%p\n", __func__, fb);
+
+	if (fb->file)
+		fput(fb->file);
+	put_cred(fb->cred);
+	kfree_rcu(fb, rcu);
+}
+
+void fuse_backing_put(struct fuse_backing *fb)
+{
+	if (fb && refcount_dec_and_test(&fb->count))
+		fuse_backing_free(fb);
+}
+
+void fuse_backing_files_init(struct fuse_conn *fc)
+{
+	idr_init(&fc->backing_files_map);
+}
+
+static int fuse_backing_id_alloc(struct fuse_conn *fc, struct fuse_backing *fb)
+{
+	int id;
+
+	idr_preload(GFP_KERNEL);
+	spin_lock(&fc->lock);
+	/* FIXME: xarray might be space inefficient */
+	id = idr_alloc_cyclic(&fc->backing_files_map, fb, 1, 0, GFP_ATOMIC);
+	spin_unlock(&fc->lock);
+	idr_preload_end();
+
+	WARN_ON_ONCE(id == 0);
+	return id;
+}
+
+static struct fuse_backing *fuse_backing_id_remove(struct fuse_conn *fc,
+						   int id)
+{
+	struct fuse_backing *fb;
+
+	spin_lock(&fc->lock);
+	fb = idr_remove(&fc->backing_files_map, id);
+	spin_unlock(&fc->lock);
+
+	return fb;
+}
+
+static int fuse_backing_id_free(int id, void *p, void *data)
+{
+	struct fuse_backing *fb = p;
+
+	WARN_ON_ONCE(refcount_read(&fb->count) != 1);
+	fuse_backing_free(fb);
+	return 0;
+}
+
+void fuse_backing_files_free(struct fuse_conn *fc)
+{
+	idr_for_each(&fc->backing_files_map, fuse_backing_id_free, NULL);
+	idr_destroy(&fc->backing_files_map);
+}
+
+int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map)
+{
+	struct file *file;
+	struct super_block *backing_sb;
+	struct fuse_backing *fb = NULL;
+	int res;
+
+	pr_debug("%s: fd=%d flags=0x%x\n", __func__, map->fd, map->flags);
+
+	/* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */
+	res = -EPERM;
+	if (!fc->passthrough || !capable(CAP_SYS_ADMIN))
+		goto out;
+
+	res = -EINVAL;
+	if (map->flags || map->padding)
+		goto out;
+
+	file = fget_raw(map->fd);
+	res = -EBADF;
+	if (!file)
+		goto out;
+
+	/* read/write/splice/mmap passthrough only relevant for regular files */
+	res = d_is_dir(file->f_path.dentry) ? -EISDIR : -EINVAL;
+	if (!d_is_reg(file->f_path.dentry))
+		goto out_fput;
+
+	backing_sb = file_inode(file)->i_sb;
+	res = -ELOOP;
+	if (backing_sb->s_stack_depth >= fc->max_stack_depth)
+		goto out_fput;
+
+	fb = kmalloc(sizeof(struct fuse_backing), GFP_KERNEL);
+	res = -ENOMEM;
+	if (!fb)
+		goto out_fput;
+
+	fb->file = file;
+	fb->cred = prepare_creds();
+	refcount_set(&fb->count, 1);
+
+	res = fuse_backing_id_alloc(fc, fb);
+	if (res < 0) {
+		fuse_backing_free(fb);
+		fb = NULL;
+	}
+
+out:
+	pr_debug("%s: fb=0x%p, ret=%i\n", __func__, fb, res);
+
+	return res;
+
+out_fput:
+	fput(file);
+	goto out;
+}
+
+int fuse_backing_close(struct fuse_conn *fc, int backing_id)
+{
+	struct fuse_backing *fb = NULL;
+	int err;
+
+	pr_debug("%s: backing_id=%d\n", __func__, backing_id);
+
+	/* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */
+	err = -EPERM;
+	if (!fc->passthrough || !capable(CAP_SYS_ADMIN))
+		goto out;
+
+	err = -EINVAL;
+	if (backing_id <= 0)
+		goto out;
+
+	err = -ENOENT;
+	fb = fuse_backing_id_remove(fc, backing_id);
+	if (!fb)
+		goto out;
+
+	fuse_backing_put(fb);
+	err = 0;
+out:
+	pr_debug("%s: fb=0x%p, err=%i\n", __func__, fb, err);
+
+	return err;
+}
+
+struct fuse_backing *fuse_backing_lookup(struct fuse_conn *fc, int backing_id)
+{
+	struct fuse_backing *fb;
+
+	rcu_read_lock();
+	fb = idr_find(&fc->backing_files_map, backing_id);
+	fb = fuse_backing_get(fb);
+	rcu_read_unlock();
+
+	return fb;
+}
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 2a730d88cc3b..140bd5730d99 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -11,6 +11,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/fs_context.h>
+#include <linux/namei.h>

 #define FUSE_CTL_SUPER_MAGIC 0x65735543

@@ -204,15 +205,13 @@ static const struct file_operations fuse_conn_congestion_threshold_ops = {

 static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
 					  struct fuse_conn *fc,
-					  const char *name,
-					  int mode, int nlink,
+					  const char *name, int mode,
 					  const struct inode_operations *iop,
 					  const struct file_operations *fop)
 {
 	struct dentry *dentry;
 	struct inode *inode;

-	BUG_ON(fc->ctl_ndents >= FUSE_CTL_NUM_DENTRIES);
 	dentry = d_alloc_name(parent, name);
 	if (!dentry)
 		return NULL;
@@ -232,12 +231,19 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
 	if (iop)
 		inode->i_op = iop;
 	inode->i_fop = fop;
-	set_nlink(inode, nlink);
+	if (S_ISDIR(mode)) {
+		inc_nlink(d_inode(parent));
+		inc_nlink(inode);
+	}
 	inode->i_private = fc;
-	d_add(dentry, inode);
-
-	fc->ctl_dentry[fc->ctl_ndents++] = dentry;
-
+	d_make_persistent(dentry, inode);
+	dput(dentry);
+
+	/*
+	 * We are returning a borrowed reference here - it's only good while
+	 * fuse_mutex is held.  Actually it's d_make_persistent() return
+	 * value...
+	 */
 	return dentry;
 }

@@ -254,22 +260,21 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
 		return 0;

 	parent = fuse_control_sb->s_root;
-	inc_nlink(d_inode(parent));
 	sprintf(name, "%u", fc->dev);
-	parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2,
+	parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500,
 				     &simple_dir_inode_operations,
 				     &simple_dir_operations);
 	if (!parent)
 		goto err;

-	if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1,
+	if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400,
 				 NULL, &fuse_ctl_waiting_ops) ||
-	    !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1,
+	    !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200,
 				 NULL, &fuse_ctl_abort_ops) ||
 	    !fuse_ctl_add_dentry(parent, fc, "max_background", S_IFREG | 0600,
-				 1, NULL, &fuse_conn_max_background_ops) ||
+				 NULL, &fuse_conn_max_background_ops) ||
 	    !fuse_ctl_add_dentry(parent, fc, "congestion_threshold",
-				 S_IFREG | 0600, 1, NULL,
+				 S_IFREG | 0600, NULL,
 				 &fuse_conn_congestion_threshold_ops))
 		goto err;

@@ -280,27 +285,24 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
 	return -ENOMEM;
 }

+static void remove_one(struct dentry *dentry)
+{
+	d_inode(dentry)->i_private = NULL;
+}
+
 /*
  * Remove a connection from the control filesystem (if it exists).
  * Caller must hold fuse_mutex
  */
 void fuse_ctl_remove_conn(struct fuse_conn *fc)
 {
-	int i;
+	char name[32];

 	if (!fuse_control_sb || fc->no_control)
 		return;

-	for (i = fc->ctl_ndents - 1; i >= 0; i--) {
-		struct dentry *dentry = fc->ctl_dentry[i];
-		d_inode(dentry)->i_private = NULL;
-		if (!i) {
-			/* Get rid of submounts: */
-			d_invalidate(dentry);
-		}
-		dput(dentry);
-	}
-	drop_nlink(d_inode(fuse_control_sb->s_root));
+	sprintf(name, "%u", fc->dev);
+	simple_remove_by_name(fuse_control_sb->s_root, name, remove_one);
 }

 static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fsc)
@@ -346,15 +348,11 @@ static int fuse_ctl_init_fs_context(struct fs_context *fsc)

 static void fuse_ctl_kill_sb(struct super_block *sb)
 {
-	struct fuse_conn *fc;
-
 	mutex_lock(&fuse_mutex);
 	fuse_control_sb = NULL;
-	list_for_each_entry(fc, &fuse_conn_list, entry)
-		fc->ctl_ndents = 0;
 	mutex_unlock(&fuse_mutex);

-	kill_litter_super(sb);
+	kill_anon_super(sb);
 }

 static struct file_system_type fuse_ctl_fs_type = {
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index b39844d75a80..28c96961e85d 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -52,6 +52,7 @@
 #include <linux/user_namespace.h>

 #include "fuse_i.h"
+#include "fuse_dev_i.h"

 #define CUSE_CONNTBL_LEN	64

@@ -547,7 +548,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
  */
 static int cuse_channel_release(struct inode *inode, struct file *file)
 {
-	struct fuse_dev *fud = file->private_data;
+	struct fuse_dev *fud = __fuse_get_dev(file);
 	struct cuse_conn *cc = fc_to_cc(fud->fc);

 	/* remove from the conntbl, no more access from this point on */
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 0502bf3cdf6a..ac6d4c1064cc 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -10,7 +10,6 @@
 #include <linux/dax.h>
 #include <linux/uio.h>
 #include <linux/pagemap.h>
-#include <linux/pfn_t.h>
 #include <linux/iomap.h>
 #include <linux/interval_tree.h>

@@ -757,7 +756,7 @@ static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, unsigned int order,
 	vm_fault_t ret;
 	struct inode *inode = file_inode(vmf->vma->vm_file);
 	struct super_block *sb = inode->i_sb;
-	pfn_t pfn;
+	unsigned long pfn;
 	int error = 0;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_conn_dax *fcd = fc->dax;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index e80cd8f2c049..6d59cbc877c6 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -25,7 +25,6 @@
 #include <linux/sched.h>
 #include <linux/seq_file.h>

-#define CREATE_TRACE_POINTS
 #include "fuse_trace.h"

 MODULE_ALIAS_MISCDEV(FUSE_MINOR);
@@ -119,7 +118,7 @@ void fuse_check_timeout(struct work_struct *work)
 		goto abort_conn;

 out:
-	queue_delayed_work(system_wq, &fc->timeout.work,
+	queue_delayed_work(system_percpu_wq, &fc->timeout.work,
 			   fuse_timeout_timer_freq);
 	return;

@@ -207,8 +206,9 @@ static struct fuse_req *fuse_get_req(struct mnt_idmap *idmap,

 	if (fuse_block_alloc(fc, for_background)) {
 		err = -EINTR;
-		if (wait_event_killable_exclusive(fc->blocked_waitq,
-				!fuse_block_alloc(fc, for_background)))
+		if (wait_event_state_exclusive(fc->blocked_waitq,
+				!fuse_block_alloc(fc, for_background),
+				(TASK_KILLABLE | TASK_FREEZABLE)))
 			goto out;
 	}
 	/* Matches smp_wmb() in fuse_set_initialized() */
@@ -322,6 +322,7 @@ unsigned int fuse_req_hash(u64 unique)
 {
 	return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS);
 }
+EXPORT_SYMBOL_GPL(fuse_req_hash);

 /*
  * A new request is available, wake fiq->waitq
@@ -369,12 +370,32 @@ void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
 	}
 }

+static inline void fuse_request_assign_unique_locked(struct fuse_iqueue *fiq,
+						     struct fuse_req *req)
+{
+	if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
+		req->in.h.unique = fuse_get_unique_locked(fiq);
+
+	/* tracepoint captures in.h.unique and in.h.len */
+	trace_fuse_request_send(req);
+}
+
+inline void fuse_request_assign_unique(struct fuse_iqueue *fiq,
+				       struct fuse_req *req)
+{
+	if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
+		req->in.h.unique = fuse_get_unique(fiq);
+
+	/* tracepoint captures in.h.unique and in.h.len */
+	trace_fuse_request_send(req);
+}
+EXPORT_SYMBOL_GPL(fuse_request_assign_unique);
+
 static void fuse_dev_queue_req(struct fuse_iqueue *fiq, struct fuse_req *req)
 {
 	spin_lock(&fiq->lock);
 	if (fiq->connected) {
-		if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
-			req->in.h.unique = fuse_get_unique_locked(fiq);
+		fuse_request_assign_unique_locked(fiq, req);
 		list_add_tail(&req->list, &fiq->pending);
 		fuse_dev_wake_and_unlock(fiq);
 	} else {
@@ -397,7 +418,6 @@ static void fuse_send_one(struct fuse_iqueue *fiq, struct fuse_req *req)
 	req->in.h.len = sizeof(struct fuse_in_header) +
 		fuse_len_args(req->args->in_numargs,
 			      (struct fuse_arg *) req->args->in_args);
-	trace_fuse_request_send(req);
 	fiq->ops->send_req(fiq, req);
 }

@@ -687,10 +707,10 @@ static bool fuse_request_queue_background_uring(struct fuse_conn *fc,
 {
 	struct fuse_iqueue *fiq = &fc->iq;

-	req->in.h.unique = fuse_get_unique(fiq);
 	req->in.h.len = sizeof(struct fuse_in_header) +
 		fuse_len_args(req->args->in_numargs,
 			      (struct fuse_arg *) req->args->in_args);
+	fuse_request_assign_unique(fiq, req);

 	return fuse_uring_queue_bq_req(req);
 }
@@ -826,7 +846,7 @@ void fuse_copy_init(struct fuse_copy_state *cs, bool write,
 }

 /* Unmap and put previous page of userspace buffer */
-static void fuse_copy_finish(struct fuse_copy_state *cs)
+void fuse_copy_finish(struct fuse_copy_state *cs)
 {
 	if (cs->currbuf) {
 		struct pipe_buffer *buf = cs->currbuf;
@@ -935,7 +955,7 @@ static int fuse_check_folio(struct folio *folio)
 {
 	if (folio_mapped(folio) ||
 	    folio->mapping != NULL ||
-	    (folio->flags & PAGE_FLAGS_CHECK_AT_PREP &
+	    (folio->flags.f & PAGE_FLAGS_CHECK_AT_PREP &
 	     ~(1 << PG_locked |
 	       1 << PG_referenced |
 	       1 << PG_lru |
@@ -1528,14 +1548,34 @@ static int fuse_dev_open(struct inode *inode, struct file *file)
 	return 0;
 }
+struct fuse_dev *fuse_get_dev(struct file *file)
+{
+	struct fuse_dev *fud = __fuse_get_dev(file);
+	int err;
+
+	if (likely(fud))
+		return fud;
+
+	err = wait_event_interruptible(fuse_dev_waitq,
+				       READ_ONCE(file->private_data) != FUSE_DEV_SYNC_INIT);
+	if (err)
+		return ERR_PTR(err);
+
+	fud = __fuse_get_dev(file);
+	if (!fud)
+		return ERR_PTR(-EPERM);
+
+	return fud;
+}
+
 static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to)
 {
 	struct fuse_copy_state cs;
 	struct file *file = iocb->ki_filp;
 	struct fuse_dev *fud = fuse_get_dev(file);

-	if (!fud)
-		return -EPERM;
+	if (IS_ERR(fud))
+		return PTR_ERR(fud);

 	if (!user_backed_iter(to))
 		return -EINVAL;
@@ -1555,8 +1595,8 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
 	struct fuse_copy_state cs;
 	struct fuse_dev *fud = fuse_get_dev(in);

-	if (!fud)
-		return -EPERM;
+	if (IS_ERR(fud))
+		return PTR_ERR(fud);

 	bufs = kvmalloc_array(pipe->max_usage, sizeof(struct pipe_buffer),
 			      GFP_KERNEL);
@@ -1600,35 +1640,31 @@ static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
 			    struct fuse_copy_state *cs)
 {
 	struct fuse_notify_poll_wakeup_out outarg;
-	int err = -EINVAL;
+	int err;

 	if (size != sizeof(outarg))
-		goto err;
+		return -EINVAL;

 	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
 	if (err)
-		goto err;
+		return err;

 	fuse_copy_finish(cs);
 	return fuse_notify_poll_wakeup(fc, &outarg);
-
-err:
-	fuse_copy_finish(cs);
-	return err;
 }

 static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
 				   struct fuse_copy_state *cs)
 {
 	struct fuse_notify_inval_inode_out outarg;
-	int err = -EINVAL;
+	int err;

 	if (size != sizeof(outarg))
-		goto err;
+		return -EINVAL;

 	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
 	if (err)
-		goto err;
+		return err;
 	fuse_copy_finish(cs);

 	down_read(&fc->killsb);
@@ -1636,10 +1672,6 @@ static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
 				       outarg.off, outarg.len);
 	up_read(&fc->killsb);
 	return err;
-
-err:
-	fuse_copy_finish(cs);
-	return err;
 }

 static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
@@ -1647,29 +1679,26 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
 {
 	struct fuse_notify_inval_entry_out outarg;
 	int err;
-	char *buf = NULL;
+	char *buf;
 	struct qstr name;

-	err = -EINVAL;
 	if (size < sizeof(outarg))
-		goto err;
+		return -EINVAL;

 	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
 	if (err)
-		goto err;
+		return err;

-	err = -ENAMETOOLONG;
 	if (outarg.namelen > fc->name_max)
-		goto err;
+		return -ENAMETOOLONG;

-	err = -EINVAL;
 	if (size != sizeof(outarg) + outarg.namelen + 1)
-		goto err;
+		return -EINVAL;

-	err = -ENOMEM;
 	buf = kzalloc(outarg.namelen + 1, GFP_KERNEL);
 	if (!buf)
-		goto err;
+		return -ENOMEM;

 	name.name = buf;
 	name.len = outarg.namelen;
@@ -1682,12 +1711,8 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
 	down_read(&fc->killsb);
 	err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name, outarg.flags);
 	up_read(&fc->killsb);
-	kfree(buf);
-	return err;
-
-err:
 	kfree(buf);
-	fuse_copy_finish(cs);
 	return err;
 }

@@ -1696,29 +1721,25 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
 {
 	struct fuse_notify_delete_out outarg;
 	int err;
-	char *buf = NULL;
+	char *buf;
 	struct qstr name;

-	err = -EINVAL;
 	if (size < sizeof(outarg))
-		goto err;
+		return -EINVAL;

 	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
 	if (err)
-		goto err;
+		return err;

-	err = -ENAMETOOLONG;
 	if (outarg.namelen > fc->name_max)
-		goto err;
+		return -ENAMETOOLONG;

-	err = -EINVAL;
 	if (size != sizeof(outarg) + outarg.namelen + 1)
-		goto err;
+		return -EINVAL;

-	err = -ENOMEM;
 	buf = kzalloc(outarg.namelen + 1, GFP_KERNEL);
 	if (!buf)
-		goto err;
+		return -ENOMEM;

 	name.name = buf;
 	name.len = outarg.namelen;
@@ -1731,12 +1752,8 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
 	down_read(&fc->killsb);
 	err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name, 0);
 	up_read(&fc->killsb);
-	kfree(buf);
-	return err;
-
-err:
 	kfree(buf);
-	fuse_copy_finish(cs);
 	return err;
 }

@@ -1754,17 +1771,15 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
 	loff_t file_size;
 	loff_t end;

-	err = -EINVAL;
 	if (size < sizeof(outarg))
-		goto out_finish;
+		return -EINVAL;

 	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
 	if (err)
-		goto out_finish;
+		return err;

-	err = -EINVAL;
 	if (size - sizeof(outarg) != outarg.size)
-		goto out_finish;
+		return -EINVAL;

 	nodeid = outarg.nodeid;

@@ -1824,8 +1839,6 @@ out_iput:
 	iput(inode);
 out_up_killsb:
 	up_read(&fc->killsb);
-out_finish:
-	fuse_copy_finish(cs);
 	return err;
 }

@@ -1893,7 +1906,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,

 	index = outarg->offset >> PAGE_SHIFT;

-	while (num) {
+	while (num && ap->num_folios < num_pages) {
 		struct folio *folio;
 		unsigned int folio_offset;
 		unsigned int nr_bytes;
@@ -1940,13 +1953,12 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
 	u64 nodeid;
 	int err;

-	err = -EINVAL;
 	if (size != sizeof(outarg))
-		goto copy_finish;
+		return -EINVAL;

 	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
 	if (err)
-		goto copy_finish;
+		return err;

 	fuse_copy_finish(cs);

@@ -1962,10 +1974,6 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
 	up_read(&fc->killsb);

 	return err;
-
-copy_finish:
-	fuse_copy_finish(cs);
-	return err;
 }

 /*
@@ -2033,17 +2041,54 @@ static int fuse_notify_resend(struct fuse_conn *fc)

 /*
  * Increments the fuse connection epoch. This will result of dentries from
- * previous epochs to be invalidated.
- *
- * XXX optimization: add call to shrink_dcache_sb()?
+ * previous epochs to be invalidated. Additionally, if inval_wq is set, a work
+ * queue is scheduled to trigger the invalidation.
  */
 static int fuse_notify_inc_epoch(struct fuse_conn *fc)
 {
 	atomic_inc(&fc->epoch);
+	if (inval_wq)
+		schedule_work(&fc->epoch_work);

 	return 0;
 }

+static int fuse_notify_prune(struct fuse_conn *fc, unsigned int size,
+			     struct fuse_copy_state *cs)
+{
+	struct fuse_notify_prune_out outarg;
+	const unsigned int batch = 512;
+	u64 *nodeids __free(kfree) = kmalloc(sizeof(u64) * batch, GFP_KERNEL);
+	unsigned int num, i;
+	int err;
+
+	if (!nodeids)
+		return -ENOMEM;
+
+	if (size < sizeof(outarg))
+		return -EINVAL;
+
+	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+	if (err)
+		return err;
+
+	if (size - sizeof(outarg) != outarg.count * sizeof(u64))
+		return -EINVAL;
+
+	for (; outarg.count; outarg.count -= num) {
+		num = min(batch, outarg.count);
+		err = fuse_copy_one(cs, nodeids, num * sizeof(u64));
+		if (err)
+			return err;
+
+		scoped_guard(rwsem_read, &fc->killsb) {
+			for (i = 0; i < num; i++)
+				fuse_try_prune_one_inode(fc, nodeids[i]);
+		}
+	}
+	return 0;
+}
+
 static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 		       unsigned int size, struct fuse_copy_state *cs)
 {
@@ -2075,8 +2120,10 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 	case FUSE_NOTIFY_INC_EPOCH:
 		return fuse_notify_inc_epoch(fc);

+	case FUSE_NOTIFY_PRUNE:
+		return fuse_notify_prune(fc, size, cs);
+
 	default:
-		fuse_copy_finish(cs);
 		return -EINVAL;
 	}
 }
@@ -2156,7 +2203,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
 	 */
 	if (!oh.unique) {
 		err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
-		goto out;
+		goto copy_finish;
 	}

 	err = -EINVAL;
@@ -2229,7 +2276,7 @@ copy_finish:
 static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct fuse_copy_state cs;
-	struct fuse_dev *fud = fuse_get_dev(iocb->ki_filp);
+	struct fuse_dev *fud = __fuse_get_dev(iocb->ki_filp);

 	if (!fud)
 		return -EPERM;
@@ -2251,11 +2298,10 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
 	unsigned idx;
 	struct pipe_buffer *bufs;
 	struct fuse_copy_state cs;
-	struct fuse_dev *fud;
+	struct fuse_dev *fud = __fuse_get_dev(out);
 	size_t rem;
 	ssize_t ret;

-	fud = fuse_get_dev(out);
 	if (!fud)
 		return -EPERM;

@@ -2341,7 +2387,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait)
 	struct fuse_iqueue *fiq;
 	struct fuse_dev *fud = fuse_get_dev(file);

-	if (!fud)
+	if (IS_ERR(fud))
 		return EPOLLERR;

 	fiq = &fud->fc->iq;
@@ -2394,7 +2440,7 @@ static void end_polls(struct fuse_conn *fc)
  * The same effect is usually achievable through killing the filesystem daemon
  * and all users of the filesystem.  The exception is the combination of an
  * asynchronous request and the tricky deadlock (see
- * Documentation/filesystems/fuse.rst).
+ * Documentation/filesystems/fuse/fuse.rst).
  *
  * Aborting requests under I/O goes as follows: 1: Separate out unlocked
  * requests, they should be finished off immediately.  Locked requests will be
@@ -2488,7 +2534,7 @@ void fuse_wait_aborted(struct fuse_conn *fc)

 int fuse_dev_release(struct inode *inode, struct file *file)
 {
-	struct fuse_dev *fud = fuse_get_dev(file);
+	struct fuse_dev *fud = __fuse_get_dev(file);

 	if (fud) {
 		struct fuse_conn *fc = fud->fc;
@@ -2519,8 +2565,8 @@ static int fuse_dev_fasync(int fd, struct file *file, int on)
 {
 	struct fuse_dev *fud = fuse_get_dev(file);

-	if (!fud)
-		return -EPERM;
+	if (IS_ERR(fud))
+		return PTR_ERR(fud);

 	/* No locking - fasync_helper does its own locking */
 	return fasync_helper(fd, file, on, &fud->fc->iq.fasync);
@@ -2530,7 +2576,7 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new)
 {
 	struct fuse_dev *fud;

-	if (new->private_data)
+	if (__fuse_get_dev(new))
 		return -EINVAL;

 	fud = fuse_dev_alloc_install(fc);
@@ -2561,7 +2607,7 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp)
 	 * uses the same ioctl handler.
 	 */
 	if (fd_file(f)->f_op == file->f_op)
-		fud = fuse_get_dev(fd_file(f));
+		fud = __fuse_get_dev(fd_file(f));

 	res = -EINVAL;
 	if (fud) {
@@ -2579,8 +2625,8 @@ static long fuse_dev_ioctl_backing_open(struct file *file,
 	struct fuse_dev *fud = fuse_get_dev(file);
 	struct fuse_backing_map map;

-	if (!fud)
-		return -EPERM;
+	if (IS_ERR(fud))
+		return PTR_ERR(fud);

 	if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
 		return -EOPNOTSUPP;
@@ -2596,8 +2642,8 @@ static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp)
 	struct fuse_dev *fud = fuse_get_dev(file);
 	int backing_id;

-	if (!fud)
-		return -EPERM;
+	if (IS_ERR(fud))
+		return PTR_ERR(fud);

 	if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
 		return -EOPNOTSUPP;
@@ -2608,6 +2654,19 @@ static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp)
 	return fuse_backing_close(fud->fc, backing_id);
 }

+static long fuse_dev_ioctl_sync_init(struct file *file)
+{
+	int err = -EINVAL;
+
+	mutex_lock(&fuse_mutex);
+	if (!__fuse_get_dev(file)) {
+		WRITE_ONCE(file->private_data, FUSE_DEV_SYNC_INIT);
+		err = 0;
+	}
+	mutex_unlock(&fuse_mutex);
+	return err;
+}
+
 static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
 			   unsigned long arg)
 {
@@ -2623,6 +2682,9 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
 	case FUSE_DEV_IOC_BACKING_CLOSE:
 		return fuse_dev_ioctl_backing_close(file, argp);

+	case FUSE_DEV_IOC_SYNC_INIT:
+		return fuse_dev_ioctl_sync_init(file);
+
 	default:
 		return -ENOTTY;
 	}
@@ -2631,7 +2693,7 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
 #ifdef CONFIG_PROC_FS
 static void fuse_dev_show_fdinfo(struct seq_file *seq, struct file *file)
 {
-	struct fuse_dev *fud = fuse_get_dev(file);
+	struct fuse_dev *fud = __fuse_get_dev(file);

 	if (!fud)
 		return;
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index 249b210becb1..5ceb217ced1b 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -7,6 +7,7 @@
 #include "fuse_i.h"
 #include "dev_uring_i.h"
 #include "fuse_dev_i.h"
+#include "fuse_trace.h"

 #include <linux/fs.h>
 #include <linux/io_uring/cmd.h>
@@ -85,6 +86,7 @@ static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req,
 	lockdep_assert_not_held(&queue->lock);
 	spin_lock(&queue->lock);
 	ent->fuse_req = NULL;
+	list_del_init(&req->list);
 	if (test_bit(FR_BACKGROUND, &req->flags)) {
 		queue->active_background--;
 		spin_lock(&fc->bg_lock);
@@ -351,7 +353,7 @@ static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent)
 	spin_unlock(&queue->lock);

 	if (cmd)
-		io_uring_cmd_done(cmd, -ENOTCONN, 0, IO_URING_F_UNLOCKED);
+		io_uring_cmd_done(cmd, -ENOTCONN, IO_URING_F_UNLOCKED);

 	if (req)
 		fuse_uring_stop_fuse_req_end(req);
@@ -518,7 +520,7 @@ static void fuse_uring_cancel(struct io_uring_cmd *cmd,

 	if (need_cmd_done) {
 		/* no queue lock to avoid lock order issues */
-		io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags);
+		io_uring_cmd_done(cmd, -ENOTCONN, issue_flags);
 	}
 }

@@ -597,12 +599,14 @@ static int fuse_uring_copy_from_ring(struct fuse_ring *ring,
 	cs.is_uring = true;
 	cs.req = req;

-	return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz);
+	err = fuse_copy_out_args(&cs, args, ring_in_out.payload_sz);
+	fuse_copy_finish(&cs);
+	return err;
 }

- /*
-  * Copy data from the req to the ring buffer
-  */
+/*
+ * Copy data from the req to the ring buffer
+ */
 static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req,
 				   struct fuse_ring_ent *ent)
 {
@@ -648,6 +652,7 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req,
 	/* copy the payload */
 	err = fuse_copy_args(&cs, num_args, args->in_pages,
 			     (struct fuse_arg *)in_args, 0);
+	fuse_copy_finish(&cs);
 	if (err) {
 		pr_info_ratelimited("%s fuse_copy_args failed\n", __func__);
 		return err;
@@ -733,7 +738,7 @@ static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent,
 	list_move_tail(&ent->list, &queue->ent_in_userspace);
 	spin_unlock(&queue->lock);

-	io_uring_cmd_done(cmd, 0, 0, issue_flags);
+	io_uring_cmd_done(cmd, 0, issue_flags);
 	return 0;
 }

@@ -1139,9 +1144,9 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
 		return -EINVAL;

 	fud = fuse_get_dev(cmd->file);
-	if (!fud) {
+	if (IS_ERR(fud)) {
 		pr_info_ratelimited("No fuse device found\n");
-		return -ENOTCONN;
+		return PTR_ERR(fud);
 	}
 	fc = fud->fc;

@@ -1200,7 +1205,7 @@ static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd,
 	ent->cmd = NULL;
 	spin_unlock(&queue->lock);

-	io_uring_cmd_done(cmd, ret, 0, issue_flags);
+	io_uring_cmd_done(cmd, ret, issue_flags);
 }

 /*
@@ -1208,14 +1213,15 @@ static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd,
 * User buffers are not mapped yet - the application does not have permission
 * to write to it - this has to be executed in ring task context.
 */
-static void fuse_uring_send_in_task(struct io_uring_cmd *cmd,
-				    unsigned int issue_flags)
+static void fuse_uring_send_in_task(struct io_tw_req tw_req, io_tw_token_t tw)
 {
+	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
+	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
 	struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd);
 	struct fuse_ring_queue *queue = ent->queue;
 	int err;

-	if (!(issue_flags & IO_URING_F_TASK_DEAD)) {
+	if (!tw.cancel) {
 		err = fuse_uring_prepare_send(ent, ent->fuse_req);
 		if (err) {
 			fuse_uring_next_fuse_req(ent, queue, issue_flags);
@@ -1268,8 +1274,7 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req)
 	if (!queue)
 		goto err;

-	if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
-		req->in.h.unique = fuse_get_unique(fiq);
+	fuse_request_assign_unique(fiq, req);

 	spin_lock(&queue->lock);
 	err = -ENOTCONN;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 45b4c3cc1396..4b6b3d2758ff 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -27,6 +27,67 @@ module_param(allow_sys_admin_access, bool, 0644);
 MODULE_PARM_DESC(allow_sys_admin_access,
 		 "Allow users with CAP_SYS_ADMIN in initial userns to bypass allow_other access check");

+struct dentry_bucket {
+	struct rb_root tree;
+	spinlock_t lock;
+};
+
+#define HASH_BITS 5
+#define HASH_SIZE (1 << HASH_BITS)
+static struct dentry_bucket dentry_hash[HASH_SIZE];
+struct delayed_work dentry_tree_work;
+
+/* Minimum invalidation work queue frequency */
+#define FUSE_DENTRY_INVAL_FREQ_MIN 5
+
+unsigned __read_mostly inval_wq;
+static int inval_wq_set(const char *val, const struct kernel_param *kp)
+{
+	unsigned int num;
+	unsigned int old = inval_wq;
+	int ret;
+
+	if (!val)
+		return -EINVAL;
+
+	ret = kstrtouint(val, 0, &num);
+	if (ret)
+		return ret;
+
+	if ((num < FUSE_DENTRY_INVAL_FREQ_MIN) && (num != 0))
+		return -EINVAL;
+
+	/* This should prevent overflow in secs_to_jiffies() */
+	if (num > USHRT_MAX)
+		return -EINVAL;
+
+	*((unsigned int *)kp->arg) = num;
+
+	if (num && !old)
+		schedule_delayed_work(&dentry_tree_work,
+				      secs_to_jiffies(num));
+	else if (!num && old)
+		cancel_delayed_work_sync(&dentry_tree_work);
+
+	return 0;
+}
+static const struct kernel_param_ops inval_wq_ops = {
+	.set = inval_wq_set,
+	.get = param_get_uint,
+};
+module_param_cb(inval_wq, &inval_wq_ops, &inval_wq, 0644);
+__MODULE_PARM_TYPE(inval_wq, "uint");
+MODULE_PARM_DESC(inval_wq,
+		 "Dentries invalidation work queue period in secs (>= "
+		 __stringify(FUSE_DENTRY_INVAL_FREQ_MIN) ").");
+
+static inline struct dentry_bucket *get_dentry_bucket(struct dentry *dentry)
+{
+	int i = hash_ptr(dentry, HASH_BITS);
+
+	return &dentry_hash[i];
+}
+
 static void fuse_advise_use_readdirplus(struct inode *dir)
 {
 	struct fuse_inode *fi = get_fuse_inode(dir);
@@ -34,33 +95,151 @@ static void fuse_advise_use_readdirplus(struct inode *dir)
 	set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state);
 }

-#if BITS_PER_LONG >= 64
-static inline void __fuse_dentry_settime(struct dentry *entry, u64 time)
+struct fuse_dentry {
+	u64 time;
+	union {
+		struct rcu_head rcu;
+		struct rb_node node;
+	};
+	struct dentry *dentry;
+};
+
+static void __fuse_dentry_tree_del_node(struct fuse_dentry *fd,
+					struct dentry_bucket *bucket)
 {
-	entry->d_fsdata = (void *) time;
+	if (!RB_EMPTY_NODE(&fd->node)) {
+		rb_erase(&fd->node, &bucket->tree);
+		RB_CLEAR_NODE(&fd->node);
+	}
 }

-static inline u64 fuse_dentry_time(const struct dentry *entry)
+static void fuse_dentry_tree_del_node(struct dentry *dentry)
 {
-	return (u64)entry->d_fsdata;
+	struct fuse_dentry *fd = dentry->d_fsdata;
+	struct dentry_bucket *bucket = get_dentry_bucket(dentry);
+
+	spin_lock(&bucket->lock);
+	__fuse_dentry_tree_del_node(fd, bucket);
+	spin_unlock(&bucket->lock);
 }
-#else
-union fuse_dentry {
-	u64 time;
-	struct rcu_head rcu;
-};

+static void fuse_dentry_tree_add_node(struct dentry *dentry)
+{
+	struct fuse_dentry *fd = dentry->d_fsdata;
+	struct dentry_bucket *bucket;
+	struct fuse_dentry *cur;
+	struct rb_node **p, *parent = NULL;
+
+	if (!inval_wq)
+		return;
+
+	bucket = get_dentry_bucket(dentry);
+
+	spin_lock(&bucket->lock);
+
+	__fuse_dentry_tree_del_node(fd, bucket);
+
+	p = &bucket->tree.rb_node;
+	while (*p) {
+		parent = *p;
+		cur = rb_entry(*p, struct fuse_dentry, node);
+		if (fd->time < cur->time)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&fd->node, parent, p);
+	rb_insert_color(&fd->node, &bucket->tree);
+	spin_unlock(&bucket->lock);
+}
+
+/*
+ * work queue which, when enabled, will periodically check for expired dentries
+ * in the dentries tree.
+ */
+static void fuse_dentry_tree_work(struct work_struct *work)
+{
+	LIST_HEAD(dispose);
+	struct fuse_dentry *fd;
+	struct rb_node *node;
+	int i;
+
+	for (i = 0; i < HASH_SIZE; i++) {
+		spin_lock(&dentry_hash[i].lock);
+		node = rb_first(&dentry_hash[i].tree);
+		while (node) {
+			fd = rb_entry(node, struct fuse_dentry, node);
+			if (time_after64(get_jiffies_64(), fd->time)) {
+				rb_erase(&fd->node, &dentry_hash[i].tree);
+				RB_CLEAR_NODE(&fd->node);
+				spin_unlock(&dentry_hash[i].lock);
+				d_dispose_if_unused(fd->dentry, &dispose);
+				cond_resched();
+				spin_lock(&dentry_hash[i].lock);
+			} else
+				break;
+			node = rb_first(&dentry_hash[i].tree);
+		}
+		spin_unlock(&dentry_hash[i].lock);
+		shrink_dentry_list(&dispose);
+	}
+
+	if (inval_wq)
+		schedule_delayed_work(&dentry_tree_work,
+				      secs_to_jiffies(inval_wq));
+}
+
+void fuse_epoch_work(struct work_struct *work)
+{
+	struct fuse_conn *fc = container_of(work, struct fuse_conn,
+					    epoch_work);
+	struct fuse_mount *fm;
+	struct inode *inode;
+
+	down_read(&fc->killsb);
+
+	inode = fuse_ilookup(fc, FUSE_ROOT_ID, &fm);
+	if (inode) {
+		iput(inode);
+		/* Remove all possible active references to cached inodes */
+		shrink_dcache_sb(fm->sb);
+	} else
+		pr_warn("Failed to get root inode");
+
+	up_read(&fc->killsb);
+}
+
+void fuse_dentry_tree_init(void)
+{
+	int i;
+
+	for (i = 0; i < HASH_SIZE; i++) {
+		spin_lock_init(&dentry_hash[i].lock);
+		dentry_hash[i].tree = RB_ROOT;
+	}
+	INIT_DELAYED_WORK(&dentry_tree_work, fuse_dentry_tree_work);
+}
+
+void fuse_dentry_tree_cleanup(void)
+{
+	int i;
+
+	inval_wq = 0;
+	cancel_delayed_work_sync(&dentry_tree_work);
+
+	for (i = 0; i < HASH_SIZE; i++)
+		WARN_ON_ONCE(!RB_EMPTY_ROOT(&dentry_hash[i].tree));
+}

 static inline void __fuse_dentry_settime(struct dentry *dentry, u64 time)
 {
-	((union fuse_dentry *) dentry->d_fsdata)->time = time;
+	((struct fuse_dentry *) dentry->d_fsdata)->time = time;
 }

 static inline u64 fuse_dentry_time(const struct dentry *entry)
 {
-	return ((union fuse_dentry *) entry->d_fsdata)->time;
+	return ((struct fuse_dentry *) entry->d_fsdata)->time;
 }
-#endif

 static void fuse_dentry_settime(struct dentry *dentry, u64 time)
 {
@@ -81,6 +260,7 @@ static void fuse_dentry_settime(struct dentry *dentry, u64 time)
 	}

 	__fuse_dentry_settime(dentry, time);
+	fuse_dentry_tree_add_node(dentry);
 }

 /*
@@ -283,21 +463,36 @@ invalid:
 	goto out;
 }

-#if BITS_PER_LONG < 64
 static int fuse_dentry_init(struct dentry *dentry)
 {
-	dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry),
-				   GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE);
+	struct fuse_dentry *fd;
+
+	fd = kzalloc(sizeof(struct fuse_dentry),
+		     GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE);
+	if (!fd)
+		return -ENOMEM;
+
+	fd->dentry = dentry;
+	RB_CLEAR_NODE(&fd->node);
+	dentry->d_fsdata = fd;
+
+	return 0;
+}
+
+static void fuse_dentry_prune(struct dentry *dentry)
+{
+	struct fuse_dentry *fd = dentry->d_fsdata;

-	return dentry->d_fsdata ? 0 : -ENOMEM;
+	if (!RB_EMPTY_NODE(&fd->node))
+		fuse_dentry_tree_del_node(dentry);
 }
+
 static void fuse_dentry_release(struct dentry *dentry)
 {
-	union fuse_dentry *fd = dentry->d_fsdata;
+	struct fuse_dentry *fd = dentry->d_fsdata;

 	kfree_rcu(fd, rcu);
 }
-#endif

 static int fuse_dentry_delete(const struct dentry *dentry)
 {
@@ -331,20 +526,12 @@ static struct vfsmount *fuse_dentry_automount(struct path *path)
 const struct dentry_operations fuse_dentry_operations = {
 	.d_revalidate	= fuse_dentry_revalidate,
 	.d_delete	= fuse_dentry_delete,
-#if BITS_PER_LONG < 64
 	.d_init		= fuse_dentry_init,
+	.d_prune	= fuse_dentry_prune,
 	.d_release	= fuse_dentry_release,
-#endif
 	.d_automount	= fuse_dentry_automount,
 };

-const struct dentry_operations fuse_root_dentry_operations = {
-#if BITS_PER_LONG < 64
-	.d_init		= fuse_dentry_init,
-	.d_release	= fuse_dentry_release,
-#endif
-};
-
 int fuse_valid_type(int m)
 {
 	return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) ||
@@ -478,7 +665,7 @@ static int get_security_context(struct dentry *entry, umode_t mode,
 	u32 total_len = sizeof(*header);
 	int err, nr_ctx = 0;
 	const char *name = NULL;
-	size_t namelen;
+	size_t namesize;

 	err = security_dentry_init_security(entry, mode, &entry->d_name,
 					    &name, &lsmctx);
@@ -489,12 +676,12 @@ static int get_security_context(struct dentry *entry, umode_t mode,

 	if (lsmctx.len) {
 		nr_ctx = 1;
-		namelen = strlen(name) + 1;
+		namesize = strlen(name) + 1;
 		err = -EIO;
-		if (WARN_ON(namelen > XATTR_NAME_MAX + 1 ||
+		if (WARN_ON(namesize > XATTR_NAME_MAX + 1 ||
 			    lsmctx.len > S32_MAX))
 			goto out_err;
-		total_len += FUSE_REC_ALIGN(sizeof(*fctx) + namelen +
+		total_len += FUSE_REC_ALIGN(sizeof(*fctx) + namesize +
 					    lsmctx.len);
 	}

@@ -511,8 +698,8 @@ static int get_security_context(struct dentry *entry, umode_t mode,
 		fctx->size = lsmctx.len;
 		ptr += sizeof(*fctx);

-		strcpy(ptr, name);
-		ptr += namelen;
+		strscpy(ptr, name, namesize);
+		ptr += namesize;

 		memcpy(ptr, lsmctx.context, lsmctx.len);
 	}

@@ -746,22 +933,18 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
 	int err;
 	struct mnt_idmap *idmap = file_mnt_idmap(file);
 	struct fuse_conn *fc = get_fuse_conn(dir);
-	struct dentry *res = NULL;

 	if (fuse_is_bad(dir))
 		return -EIO;

 	if (d_in_lookup(entry)) {
-		res = fuse_lookup(dir, entry, 0);
-		if (IS_ERR(res))
-			return PTR_ERR(res);
-
-		if (res)
-			entry = res;
+		struct dentry *res = fuse_lookup(dir, entry, 0);
+
+		if (res || d_really_is_positive(entry))
+			return finish_no_open(file, res);
 	}

-	if (!(flags & O_CREAT) || d_really_is_positive(entry))
-		goto no_open;
+	if (!(flags & O_CREAT))
+		return finish_no_open(file, NULL);

 	/* Only creates */
 	file->f_mode |= FMODE_CREATED;
@@ -775,16 +958,13 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
 			goto mknod;
 	} else if (err == -EEXIST)
 		fuse_invalidate_entry(entry);
-out_dput:
-	dput(res);
 	return err;

 mknod:
 	err = fuse_mknod(idmap, dir, entry, mode, 0);
 	if (err)
-		goto out_dput;
-no_open:
-	return finish_no_open(file, res);
+		return err;
+	return finish_no_open(file, NULL);
 }

 /*
@@ -1384,6 +1564,7 @@ retry:
 	generic_fillattr(idmap, request_mask, inode, stat);
 	stat->mode = fi->orig_i_mode;
 	stat->ino = fi->orig_ino;
+	stat->blksize = 1 << fi->cached_i_blkbits;
 	if (test_bit(FUSE_I_BTIME, &fi->state)) {
 		stat->btime = fi->i_btime;
 		stat->result_mask |= STATX_BTIME;
@@ -1410,27 +1591,25 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
 	if (!parent)
 		return -ENOENT;

-	inode_lock_nested(parent, I_MUTEX_PARENT);
 	if (!S_ISDIR(parent->i_mode))
-		goto unlock;
+		goto put_parent;

 	err = -ENOENT;
 	dir = d_find_alias(parent);
 	if (!dir)
-		goto unlock;
+		goto put_parent;

-	name->hash = full_name_hash(dir, name->name, name->len);
-	entry = d_lookup(dir, name);
+	entry = start_removing_noperm(dir, name);
 	dput(dir);
-	if (!entry)
-		goto unlock;
+	if (IS_ERR(entry))
+		goto put_parent;

 	fuse_dir_changed(parent);
 	if (!(flags & FUSE_EXPIRE_ONLY))
 		d_invalidate(entry);
 	fuse_invalidate_entry_cache(entry);

-	if (child_nodeid != 0 && d_really_is_positive(entry)) {
+	if (child_nodeid != 0) {
 		inode_lock(d_inode(entry));
 		if (get_node_id(d_inode(entry)) != child_nodeid) {
 			err = -ENOENT;
@@ -1458,10 +1637,9 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
 	} else {
 		err = 0;
 	}
-	dput(entry);

- unlock:
-	inode_unlock(parent);
+	end_removing(entry);
+ put_parent:
 	iput(parent);
 	return err;
 }
@@ -2243,6 +2421,7 @@ static const struct file_operations fuse_dir_operations = {
 	.fsync		= fuse_dir_fsync,
 	.unlocked_ioctl	= fuse_dir_ioctl,
 	.compat_ioctl	= fuse_dir_compat_ioctl,
+	.setlease	= simple_nosetlease,
 };

 static const struct inode_operations fuse_common_inode_operations = {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index f102afc03359..01bc894e9c2b 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -21,6 +21,7 @@
 #include <linux/filelock.h>
 #include <linux/splice.h>
 #include <linux/task_io_accounting_ops.h>
+#include <linux/iomap.h>

 static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
 			  unsigned int open_flags, int opcode,
@@ -109,7 +110,9 @@ static void fuse_file_put(struct fuse_file *ff, bool sync)
 		fuse_file_io_release(ff, ra->inode);

 	if (!args) {
-		/* Do nothing when server does not implement 'open' */
+		/* Do nothing when server does not implement 'opendir' */
+	} else if (args->opcode == FUSE_RELEASE && ff->fm->fc->no_open) {
+		fuse_release_end(ff->fm, args, 0);
 	} else if (sync) {
 		fuse_simple_request(ff->fm, args);
 		fuse_release_end(ff->fm, args, 0);
@@ -130,8 +133,17 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
 	struct fuse_file *ff;
 	int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
 	bool open = isdir ? !fc->no_opendir : !fc->no_open;
+	bool release = !isdir || open;

-	ff = fuse_file_alloc(fm, open);
+	/*
+	 * ff->args->release_args still needs to be allocated (so we can hold an
+	 * inode reference while there are pending inflight file operations when
+	 * ->release() is called, see fuse_prepare_release()) even if
+	 * fc->no_open is set else it becomes possible for reclaim to deadlock
+	 * if while servicing the readahead request the server triggers reclaim
+	 * and reclaim evicts the inode of the file being read ahead.
+ */ + ff = fuse_file_alloc(fm, release); if (!ff) return ERR_PTR(-ENOMEM); @@ -151,13 +163,14 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, fuse_file_free(ff); return ERR_PTR(err); } else { - /* No release needed */ - kfree(ff->args); - ff->args = NULL; - if (isdir) + if (isdir) { + /* No release needed */ + kfree(ff->args); + ff->args = NULL; fc->no_opendir = 1; - else + } else { fc->no_open = 1; + } } } @@ -355,8 +368,14 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff, * Make the release synchronous if this is a fuseblk mount, * synchronous RELEASE is allowed (and desirable) in this case * because the server can be trusted not to screw up. + * + * Always use the asynchronous file put because the current thread + * might be the fuse server. This can happen if a process starts some + * aio and closes the fd before the aio completes. Since aio takes its + * own ref to the file, the IO completion has to drop the ref, which is + * how the fuse server can end up closing its clients' files. */ - fuse_file_put(ff, ff->fm->fc->destroy); + fuse_file_put(ff, false); } void fuse_release_common(struct file *file, bool isdir) @@ -788,12 +807,16 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, } } -static int fuse_do_readfolio(struct file *file, struct folio *folio) +static int fuse_do_readfolio(struct file *file, struct folio *folio, + size_t off, size_t len) { struct inode *inode = folio->mapping->host; struct fuse_mount *fm = get_fuse_mount(inode); - loff_t pos = folio_pos(folio); - struct fuse_folio_desc desc = { .length = folio_size(folio) }; + loff_t pos = folio_pos(folio) + off; + struct fuse_folio_desc desc = { + .offset = off, + .length = len, + }; struct fuse_io_args ia = { .ap.args.page_zeroing = true, .ap.args.out_pages = true, @@ -820,25 +843,155 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio) if (res < desc.length) fuse_short_read(inode, attr_ver, res, &ia.ap); - folio_mark_uptodate(folio); + return 0; +} + +static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) +{ + iomap->type = IOMAP_MAPPED; + iomap->length = length; + iomap->offset = offset; + return 0; +} + +static const struct iomap_ops fuse_iomap_ops = { + .iomap_begin = fuse_iomap_begin, +}; + +struct fuse_fill_read_data { + struct file *file; + + /* Fields below are used if sending the read request asynchronously */ + struct fuse_conn *fc; + struct fuse_io_args *ia; + unsigned int nr_bytes; +}; + +/* forward declarations */ +static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos, + unsigned len, struct fuse_args_pages *ap, + unsigned cur_bytes, bool write); +static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file, + unsigned int count, bool async); + +static int fuse_handle_readahead(struct folio *folio, + struct readahead_control *rac, + struct fuse_fill_read_data *data, loff_t pos, + size_t len) +{ + struct fuse_io_args *ia = data->ia; + size_t off = offset_in_folio(folio, pos); + struct fuse_conn *fc = data->fc; + struct fuse_args_pages *ap; + unsigned int nr_pages; + + if (ia && fuse_folios_need_send(fc, pos, len, &ia->ap, data->nr_bytes, + false)) { + fuse_send_readpages(ia, data->file, data->nr_bytes, + fc->async_read); + data->nr_bytes = 0; + data->ia = NULL; + ia = NULL; + } + if (!ia) { + if (fc->num_background >= fc->congestion_threshold && + rac->ra->async_size >= readahead_count(rac)) + /* + * 
Congested and only async pages left, so skip the + * rest. + */ + return -EAGAIN; + + nr_pages = min(fc->max_pages, readahead_count(rac)); + data->ia = fuse_io_alloc(NULL, nr_pages); + if (!data->ia) + return -ENOMEM; + ia = data->ia; + } + folio_get(folio); + ap = &ia->ap; + ap->folios[ap->num_folios] = folio; + ap->descs[ap->num_folios].offset = off; + ap->descs[ap->num_folios].length = len; + data->nr_bytes += len; + ap->num_folios++; return 0; } +static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx, + size_t len) +{ + struct fuse_fill_read_data *data = ctx->read_ctx; + struct folio *folio = ctx->cur_folio; + loff_t pos = iter->pos; + size_t off = offset_in_folio(folio, pos); + struct file *file = data->file; + int ret; + + if (ctx->rac) { + ret = fuse_handle_readahead(folio, ctx->rac, data, pos, len); + } else { + /* + * for non-readahead read requests, do reads synchronously + * since it's not guaranteed that the server can handle + * out-of-order reads + */ + ret = fuse_do_readfolio(file, folio, off, len); + if (!ret) + iomap_finish_folio_read(folio, off, len, ret); + } + return ret; +} + +static void fuse_iomap_read_submit(struct iomap_read_folio_ctx *ctx) +{ + struct fuse_fill_read_data *data = ctx->read_ctx; + + if (data->ia) + fuse_send_readpages(data->ia, data->file, data->nr_bytes, + data->fc->async_read); +} + +static const struct iomap_read_ops fuse_iomap_read_ops = { + .read_folio_range = fuse_iomap_read_folio_range_async, + .submit_read = fuse_iomap_read_submit, +}; + static int fuse_read_folio(struct file *file, struct folio *folio) { struct inode *inode = folio->mapping->host; - int err; + struct fuse_fill_read_data data = { + .file = file, + }; + struct iomap_read_folio_ctx ctx = { + .cur_folio = folio, + .ops = &fuse_iomap_read_ops, + .read_ctx = &data, - err = -EIO; - if (fuse_is_bad(inode)) - goto out; + }; - err = fuse_do_readfolio(file, folio); + if (fuse_is_bad(inode)) { + folio_unlock(folio); + return -EIO; + } + + iomap_read_folio(&fuse_iomap_ops, &ctx); fuse_invalidate_atime(inode); - out: - folio_unlock(folio); - return err; + return 0; +} + +static int fuse_iomap_read_folio_range(const struct iomap_iter *iter, + struct folio *folio, loff_t pos, + size_t len) +{ + struct file *file = iter->private; + size_t off = offset_in_folio(folio, pos); + + return fuse_do_readfolio(file, folio, off, len); } static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, @@ -849,25 +1002,24 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, struct fuse_args_pages *ap = &ia->ap; size_t count = ia->read.in.size; size_t num_read = args->out_args[0].size; - struct address_space *mapping = NULL; - - for (i = 0; mapping == NULL && i < ap->num_folios; i++) - mapping = ap->folios[i]->mapping; + struct address_space *mapping; + struct inode *inode; - if (mapping) { - struct inode *inode = mapping->host; + WARN_ON_ONCE(!ap->num_folios); + mapping = ap->folios[0]->mapping; + inode = mapping->host; - /* - * Short read means EOF. If file size is larger, truncate it - */ - if (!err && num_read < count) - fuse_short_read(inode, ia->read.attr_ver, num_read, ap); + /* + * Short read means EOF. 
If file size is larger, truncate it + */ + if (!err && num_read < count) + fuse_short_read(inode, ia->read.attr_ver, num_read, ap); - fuse_invalidate_atime(inode); - } + fuse_invalidate_atime(inode); for (i = 0; i < ap->num_folios; i++) { - folio_end_read(ap->folios[i], !err); + iomap_finish_folio_read(ap->folios[i], ap->descs[i].offset, + ap->descs[i].length, err); folio_put(ap->folios[i]); } if (ia->ff) @@ -877,7 +1029,7 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, } static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file, - unsigned int count) + unsigned int count, bool async) { struct fuse_file *ff = file->private_data; struct fuse_mount *fm = ff->fm; @@ -899,7 +1051,7 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file, fuse_read_args_fill(ia, file, pos, count, FUSE_READ); ia->read.attr_ver = fuse_get_attr_version(fm->fc); - if (fm->fc->async_read) { + if (async) { ia->ff = fuse_file_get(ff); ap->args.end = fuse_readpages_end; err = fuse_simple_background(fm, &ap->args, GFP_KERNEL); @@ -916,81 +1068,20 @@ static void fuse_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); - unsigned int max_pages, nr_pages; - struct folio *folio = NULL; + struct fuse_fill_read_data data = { + .file = rac->file, + .fc = fc, + }; + struct iomap_read_folio_ctx ctx = { + .ops = &fuse_iomap_read_ops, + .rac = rac, + .read_ctx = &data + }; if (fuse_is_bad(inode)) return; - max_pages = min_t(unsigned int, fc->max_pages, - fc->max_read / PAGE_SIZE); - - /* - * This is only accurate the first time through, since readahead_folio() - * doesn't update readahead_count() from the previous folio until the - * next call. Grab nr_pages here so we know how many pages we're going - * to have to process. This means that we will exit here with - * readahead_count() == folio_nr_pages(last_folio), but we will have - * consumed all of the folios, and read_pages() will call - * readahead_folio() again which will clean up the rac. - */ - nr_pages = readahead_count(rac); - - while (nr_pages) { - struct fuse_io_args *ia; - struct fuse_args_pages *ap; - unsigned cur_pages = min(max_pages, nr_pages); - unsigned int pages = 0; - - if (fc->num_background >= fc->congestion_threshold && - rac->ra->async_size >= readahead_count(rac)) - /* - * Congested and only async pages left, so skip the - * rest. - */ - break; - - ia = fuse_io_alloc(NULL, cur_pages); - if (!ia) - break; - ap = &ia->ap; - - while (pages < cur_pages) { - unsigned int folio_pages; - - /* - * This returns a folio with a ref held on it. - * The ref needs to be held until the request is - * completed, since the splice case (see - * fuse_try_move_page()) drops the ref after it's - * replaced in the page cache. - */ - if (!folio) - folio = __readahead_folio(rac); - - folio_pages = folio_nr_pages(folio); - if (folio_pages > cur_pages - pages) { - /* - * Large folios belonging to fuse will never - * have more pages than max_pages. 
- */ - WARN_ON(!pages); - break; - } - - ap->folios[ap->num_folios] = folio; - ap->descs[ap->num_folios].length = folio_size(folio); - ap->num_folios++; - pages += folio_pages; - folio = NULL; - } - fuse_send_readpages(ia, rac->file, pages << PAGE_SHIFT); - nr_pages -= pages; - } - if (folio) { - folio_end_read(folio, false); - folio_put(folio); - } + iomap_readahead(&fuse_iomap_ops, &ctx); } static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -1147,7 +1238,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, struct address_space *mapping, struct iov_iter *ii, loff_t pos, - unsigned int max_pages) + unsigned int max_folios) { struct fuse_args_pages *ap = &ia->ap; struct fuse_conn *fc = get_fuse_conn(mapping->host); @@ -1157,12 +1248,10 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, int err = 0; num = min(iov_iter_count(ii), fc->max_write); - num = min(num, max_pages << PAGE_SHIFT); ap->args.in_pages = true; - ap->descs[0].offset = offset; - while (num) { + while (num && ap->num_folios < max_folios) { size_t tmp; struct folio *folio; pgoff_t index = pos >> PAGE_SHIFT; @@ -1375,6 +1464,10 @@ static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive) } } +static const struct iomap_write_ops fuse_iomap_write_ops = { + .read_folio_range = fuse_iomap_read_folio_range, +}; + static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; @@ -1384,6 +1477,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = mapping->host; ssize_t err, count; struct fuse_conn *fc = get_fuse_conn(inode); + bool writeback = false; if (fc->writeback_cache) { /* Update size (EOF optimization) and mode (SUID clearing) */ @@ -1392,16 +1486,11 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) if (err) return err; - if (fc->handle_killpriv_v2 && - setattr_should_drop_suidgid(idmap, - file_inode(file))) { - goto writethrough; - } - - return generic_file_write_iter(iocb, from); + if (!fc->handle_killpriv_v2 || + !setattr_should_drop_suidgid(idmap, file_inode(file))) + writeback = true; } -writethrough: inode_lock(inode); err = count = generic_write_checks(iocb, from); @@ -1420,6 +1509,15 @@ writethrough: goto out; written = direct_write_fallback(iocb, from, written, fuse_perform_write(iocb, from)); + } else if (writeback) { + /* + * Use iomap so that we can do granular uptodate reads + * and granular dirty tracking for large folios. + */ + written = iomap_file_buffered_write(iocb, from, + &fuse_iomap_ops, + &fuse_iomap_write_ops, + file); } else { written = fuse_perform_write(iocb, from); } @@ -1566,7 +1664,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (!ia) return -ENOMEM; - if (fopen_direct_io && fc->direct_io_allow_mmap) { + if (fopen_direct_io) { res = filemap_write_and_wait_range(mapping, pos, pos + count - 1); if (res) { fuse_io_free(ia); @@ -1640,6 +1738,15 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (res > 0) *ppos = pos; + if (res > 0 && write && fopen_direct_io) { + /* + * As in generic_file_direct_write(), invalidate after the + * write, to invalidate read-ahead cache that may have competed + * with the write. + */ + invalidate_inode_pages2_range(mapping, idx_from, idx_to); + } + return res > 0 ? 
res : err; } EXPORT_SYMBOL_GPL(fuse_direct_io); @@ -1785,19 +1892,16 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa) struct fuse_args_pages *ap = &wpa->ia.ap; struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); - struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - for (i = 0; i < ap->num_folios; i++) { + for (i = 0; i < ap->num_folios; i++) /* * Benchmarks showed that ending writeback within the * scope of the fi->lock alleviates xarray lock * contention and noticeably improves performance. */ - folio_end_writeback(ap->folios[i]); - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - wb_writeout_inc(&bdi->wb); - } + iomap_finish_folio_write(inode, ap->folios[i], + ap->descs[i].length); wake_up(&fi->page_waitq); } @@ -1928,17 +2032,6 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) struct fuse_file *ff; int err; - /* - * Inode is always written before the last reference is dropped and - * hence this should not be reached from reclaim. - * - * Writing back the inode from reclaim can deadlock if the request - * processing itself needs an allocation. Allocations triggering - * reclaim while serving a request can't be prevented, because it can - * involve any number of unrelated userspace processes. - */ - WARN_ON(wbc->for_reclaim); - ff = __fuse_write_file_get(fi); err = fuse_flush_times(inode, ff); if (ff) @@ -1981,19 +2074,17 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, } static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio, - uint32_t folio_index) + uint32_t folio_index, loff_t offset, unsigned len) { - struct inode *inode = folio->mapping->host; struct fuse_args_pages *ap = &wpa->ia.ap; ap->folios[folio_index] = folio; - ap->descs[folio_index].offset = 0; - ap->descs[folio_index].length = folio_size(folio); - - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); + ap->descs[folio_index].offset = offset; + ap->descs[folio_index].length = len; } static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio, + size_t offset, struct fuse_file *ff) { struct inode *inode = folio->mapping->host; @@ -2006,7 +2097,7 @@ static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio return NULL; fuse_writepage_add_to_bucket(fc, wpa); - fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio), 0); + fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio) + offset, 0); wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; wpa->inode = inode; wpa->ia.ff = ff; @@ -2018,63 +2109,28 @@ static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio return wpa; } -static int fuse_writepage_locked(struct folio *folio) -{ - struct address_space *mapping = folio->mapping; - struct inode *inode = mapping->host; - struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_writepage_args *wpa; - struct fuse_args_pages *ap; - struct fuse_file *ff; - int error = -EIO; - - ff = fuse_write_file_get(fi); - if (!ff) - goto err; - - wpa = fuse_writepage_args_setup(folio, ff); - error = -ENOMEM; - if (!wpa) - goto err_writepage_args; - - ap = &wpa->ia.ap; - ap->num_folios = 1; - - folio_start_writeback(folio); - fuse_writepage_args_page_fill(wpa, folio, 0); - - spin_lock(&fi->lock); - list_add_tail(&wpa->queue_entry, &fi->queued_writes); - fuse_flush_writepages(inode); - spin_unlock(&fi->lock); - - return 0; - -err_writepage_args: - fuse_file_put(ff, false); -err: - mapping_set_error(folio->mapping, error); - return error; -} - struct 

-static bool fuse_pages_realloc(struct fuse_fill_wb_data *data)
+static bool fuse_pages_realloc(struct fuse_fill_wb_data *data,
+			       unsigned int max_pages)
 {
	struct fuse_args_pages *ap = &data->wpa->ia.ap;
-	struct fuse_conn *fc = get_fuse_conn(data->inode);
	struct folio **folios;
	struct fuse_folio_desc *descs;
	unsigned int nfolios = min_t(unsigned int,
				     max_t(unsigned int,
					   data->max_folios * 2,
					   FUSE_DEFAULT_MAX_PAGES_PER_REQ),
-				     fc->max_pages);
+				     max_pages);
	WARN_ON(nfolios <= data->max_folios);

	folios = fuse_folios_alloc(nfolios, GFP_NOFS, &descs);
@@ -2091,10 +2147,10 @@ static bool fuse_pages_realloc(struct fuse_fill_wb_data *data)
	return true;
 }

-static void fuse_writepages_send(struct fuse_fill_wb_data *data)
+static void fuse_writepages_send(struct inode *inode,
+				 struct fuse_fill_wb_data *data)
 {
	struct fuse_writepage_args *wpa = data->wpa;
-	struct inode *inode = data->inode;
	struct fuse_inode *fi = get_fuse_inode(inode);

	spin_lock(&fi->lock);
@@ -2103,195 +2159,156 @@
	spin_unlock(&fi->lock);
 }

-static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio,
-				     struct fuse_args_pages *ap,
-				     struct fuse_fill_wb_data *data)
+static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos,
+				  unsigned len, struct fuse_args_pages *ap,
+				  unsigned cur_bytes, bool write)
 {
+	struct folio *prev_folio;
+	struct fuse_folio_desc prev_desc;
+	unsigned bytes = cur_bytes + len;
+	loff_t prev_pos;
+	size_t max_bytes = write ? fc->max_write : fc->max_read;
+
	WARN_ON(!ap->num_folios);

	/* Reached max pages */
-	if (data->nr_pages + folio_nr_pages(folio) > fc->max_pages)
+	if ((bytes + PAGE_SIZE - 1) >> PAGE_SHIFT > fc->max_pages)
		return true;

	/* Reached max write bytes */
-	if ((data->nr_pages * PAGE_SIZE) + folio_size(folio) > fc->max_write)
+	if (bytes > max_bytes)
		return true;

	/* Discontinuity */
-	if (folio_next_index(ap->folios[ap->num_folios - 1]) != folio->index)
-		return true;
-
-	/* Need to grow the pages array?  If so, did the expansion fail? */
-	if (ap->num_folios == data->max_folios && !fuse_pages_realloc(data))
+	prev_folio = ap->folios[ap->num_folios - 1];
+	prev_desc = ap->descs[ap->num_folios - 1];
+	prev_pos = folio_pos(prev_folio) + prev_desc.offset + prev_desc.length;
+	if (prev_pos != pos)
		return true;

	return false;
 }

-static int fuse_writepages_fill(struct folio *folio,
-		struct writeback_control *wbc, void *_data)
+static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc,
+					  struct folio *folio, u64 pos,
+					  unsigned len, u64 end_pos)
 {
-	struct fuse_fill_wb_data *data = _data;
+	struct fuse_fill_wb_data *data = wpc->wb_ctx;
	struct fuse_writepage_args *wpa = data->wpa;
	struct fuse_args_pages *ap = &wpa->ia.ap;
-	struct inode *inode = data->inode;
+	struct inode *inode = wpc->inode;
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_conn *fc = get_fuse_conn(inode);
-	int err;
+	loff_t offset = offset_in_folio(folio, pos);
+
+	WARN_ON_ONCE(!data);

	if (!data->ff) {
-		err = -EIO;
		data->ff = fuse_write_file_get(fi);
		if (!data->ff)
-			goto out_unlock;
+			return -EIO;
	}

-	if (wpa && fuse_writepage_need_send(fc, folio, ap, data)) {
-		fuse_writepages_send(data);
-		data->wpa = NULL;
-		data->nr_pages = 0;
+	if (wpa) {
+		bool send = fuse_folios_need_send(fc, pos, len, ap,
						  data->nr_bytes, true);

+		if (!send) {
+			/*
+			 * Need to grow the pages array? If so, did the
+			 * expansion fail?
+			 */
+			send = (ap->num_folios == data->max_folios) &&
+				!fuse_pages_realloc(data, fc->max_pages);
+		}
+
+		if (send) {
+			fuse_writepages_send(inode, data);
+			data->wpa = NULL;
+			data->nr_bytes = 0;
+		}
	}

	if (data->wpa == NULL) {
-		err = -ENOMEM;
-		wpa = fuse_writepage_args_setup(folio, data->ff);
+		wpa = fuse_writepage_args_setup(folio, offset, data->ff);
		if (!wpa)
-			goto out_unlock;
+			return -ENOMEM;
		fuse_file_get(wpa->ia.ff);
		data->max_folios = 1;
		ap = &wpa->ia.ap;
	}

-	folio_start_writeback(folio);
-	fuse_writepage_args_page_fill(wpa, folio, ap->num_folios);
-	data->nr_pages += folio_nr_pages(folio);
+	fuse_writepage_args_page_fill(wpa, folio, ap->num_folios,
+				      offset, len);
+	data->nr_bytes += len;

-	err = 0;
	ap->num_folios++;
	if (!data->wpa)
		data->wpa = wpa;
-out_unlock:
-	folio_unlock(folio);
-	return err;
+
+	return len;
 }

-static int fuse_writepages(struct address_space *mapping,
-			   struct writeback_control *wbc)
+static int fuse_iomap_writeback_submit(struct iomap_writepage_ctx *wpc,
+				       int error)
 {
-	struct inode *inode = mapping->host;
-	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_fill_wb_data data;
-	int err;
-
-	err = -EIO;
-	if (fuse_is_bad(inode))
-		goto out;
-
-	if (wbc->sync_mode == WB_SYNC_NONE &&
-	    fc->num_background >= fc->congestion_threshold)
-		return 0;
+	struct fuse_fill_wb_data *data = wpc->wb_ctx;

-	data.inode = inode;
-	data.wpa = NULL;
-	data.ff = NULL;
-	data.nr_pages = 0;
+	WARN_ON_ONCE(!data);

-	err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
-	if (data.wpa) {
-		WARN_ON(!data.wpa->ia.ap.num_folios);
-		fuse_writepages_send(&data);
+	if (data->wpa) {
+		WARN_ON(!data->wpa->ia.ap.num_folios);
+		fuse_writepages_send(wpc->inode, data);
	}
-	if (data.ff)
-		fuse_file_put(data.ff, false);
-out:
-	return err;
-}
-
-/*
- * It's worthy to make sure that space is reserved on disk for the write,
- * but how to implement it without killing performance need more thinking.
- */
-static int fuse_write_begin(struct file *file, struct address_space *mapping,
-		loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
-{
-	pgoff_t index = pos >> PAGE_SHIFT;
-	struct fuse_conn *fc = get_fuse_conn(file_inode(file));
-	struct folio *folio;
-	loff_t fsize;
-	int err = -ENOMEM;
+	if (data->ff)
+		fuse_file_put(data->ff, false);

-	WARN_ON(!fc->writeback_cache);
-
-	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
-			mapping_gfp_mask(mapping));
-	if (IS_ERR(folio))
-		goto error;
-
-	if (folio_test_uptodate(folio) || len >= folio_size(folio))
-		goto success;
-	/*
-	 * Check if the start of this folio comes after the end of file,
-	 * in which case the readpage can be optimized away.
-	 */
-	fsize = i_size_read(mapping->host);
-	if (fsize <= folio_pos(folio)) {
-		size_t off = offset_in_folio(folio, pos);
-		if (off)
-			folio_zero_segment(folio, 0, off);
-		goto success;
-	}
-	err = fuse_do_readfolio(file, folio);
-	if (err)
-		goto cleanup;
-success:
-	*foliop = folio;
-	return 0;
-
-cleanup:
-	folio_unlock(folio);
-	folio_put(folio);
-error:
-	return err;
+	return error;
 }

-static int fuse_write_end(struct file *file, struct address_space *mapping,
-		loff_t pos, unsigned len, unsigned copied,
-		struct folio *folio, void *fsdata)
-{
-	struct inode *inode = folio->mapping->host;
-
-	/* Haven't copied anything?  Skip zeroing, size extending, dirtying. */
-	if (!copied)
-		goto unlock;
-
-	pos += copied;
-	if (!folio_test_uptodate(folio)) {
-		/* Zero any unwritten bytes at the end of the page */
-		size_t endoff = pos & ~PAGE_MASK;
-		if (endoff)
-			folio_zero_segment(folio, endoff, PAGE_SIZE);
-		folio_mark_uptodate(folio);
-	}
+static const struct iomap_writeback_ops fuse_writeback_ops = {
+	.writeback_range = fuse_iomap_writeback_range,
+	.writeback_submit = fuse_iomap_writeback_submit,
+};

-	if (pos > inode->i_size)
-		i_size_write(inode, pos);
+static int fuse_writepages(struct address_space *mapping,
+			   struct writeback_control *wbc)
+{
+	struct inode *inode = mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_fill_wb_data data = {};
+	struct iomap_writepage_ctx wpc = {
+		.inode = inode,
+		.iomap.type = IOMAP_MAPPED,
+		.wbc = wbc,
+		.ops = &fuse_writeback_ops,
+		.wb_ctx = &data,
+	};

-	folio_mark_dirty(folio);
+	if (fuse_is_bad(inode))
+		return -EIO;

-unlock:
-	folio_unlock(folio);
-	folio_put(folio);
+	if (wbc->sync_mode == WB_SYNC_NONE &&
+	    fc->num_background >= fc->congestion_threshold)
+		return 0;

-	return copied;
+	return iomap_writepages(&wpc);
 }

 static int fuse_launder_folio(struct folio *folio)
 {
	int err = 0;
+	struct fuse_fill_wb_data data = {};
+	struct iomap_writepage_ctx wpc = {
+		.inode = folio->mapping->host,
+		.iomap.type = IOMAP_MAPPED,
+		.ops = &fuse_writeback_ops,
+		.wb_ctx = &data,
+	};
+
	if (folio_clear_dirty_for_io(folio)) {
-		err = fuse_writepage_locked(folio);
+		err = iomap_writeback_folio(&wpc, folio);
+		err = fuse_iomap_writeback_submit(&wpc, err);
		if (!err)
			folio_wait_writeback(folio);
	}
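
For orientation, this is the contract the two callbacks above implement (an editorial sketch of the assumed semantics, not code from the patch): iomap_writepages() walks the dirty folios, hands each contiguous range to ->writeback_range(), which returns how many bytes it accepted into the batch, and ->writeback_submit() runs once at the end to flush the batch and return the final status.

static ssize_t sketch_writeback_range(struct iomap_writepage_ctx *wpc,
				      struct folio *folio, u64 pos,
				      unsigned len, u64 end_pos)
{
	/* batch [pos, pos + len) into the request under construction */
	return len;
}

static int sketch_writeback_submit(struct iomap_writepage_ctx *wpc, int error)
{
	/* send whatever was batched; propagate the first error */
	return error;
}
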
@@ -3018,6 +3035,8 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
		.flags = flags
	};
	struct fuse_write_out outarg;
+	struct fuse_copy_file_range_out outarg_64;
+	u64 bytes_copied;
	ssize_t err;
	/* mark unstable when write-back is not used, and file_out gets
	 * extended */
@@ -3067,30 +3086,51 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
	if (is_unstable)
		set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);

-	args.opcode = FUSE_COPY_FILE_RANGE;
+	args.opcode = FUSE_COPY_FILE_RANGE_64;
	args.nodeid = ff_in->nodeid;
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.out_numargs = 1;
-	args.out_args[0].size = sizeof(outarg);
-	args.out_args[0].value = &outarg;
+	args.out_args[0].size = sizeof(outarg_64);
+	args.out_args[0].value = &outarg_64;
+	if (fc->no_copy_file_range_64) {
+fallback:
+		/* Fall back to old op that can't handle large copy length */
+		args.opcode = FUSE_COPY_FILE_RANGE;
+		args.out_args[0].size = sizeof(outarg);
+		args.out_args[0].value = &outarg;
+		inarg.len = len = min_t(size_t, len, UINT_MAX & PAGE_MASK);
+	}
	err = fuse_simple_request(fm, &args);
	if (err == -ENOSYS) {
-		fc->no_copy_file_range = 1;
-		err = -EOPNOTSUPP;
+		if (fc->no_copy_file_range_64) {
+			fc->no_copy_file_range = 1;
+			err = -EOPNOTSUPP;
+		} else {
+			fc->no_copy_file_range_64 = 1;
+			goto fallback;
+		}
	}
	if (err)
		goto out;

+	bytes_copied = fc->no_copy_file_range_64 ?
+			outarg.size : outarg_64.bytes_copied;
+
+	if (bytes_copied > len) {
+		err = -EIO;
+		goto out;
+	}
+
	truncate_inode_pages_range(inode_out->i_mapping,
				   ALIGN_DOWN(pos_out, PAGE_SIZE),
-				   ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1);
+				   ALIGN(pos_out + bytes_copied, PAGE_SIZE) - 1);

	file_update_time(file_out);
-	fuse_write_update_attr(inode_out, pos_out + outarg.size, outarg.size);
+	fuse_write_update_attr(inode_out, pos_out + bytes_copied, bytes_copied);

-	err = outarg.size;
+	err = bytes_copied;
 out:
	if (is_unstable)
		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
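
The opcode negotiation above is the usual probe-and-latch pattern: try the 64-bit op once, latch the fallback flag on -ENOSYS, and clamp the length for the legacy 32-bit op thereafter. A standalone distillation with invented names and stubbed transport (plain C, compilable):

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

static bool no_op64;	/* latched after the first -ENOSYS */

static int send_op64(uint64_t len) { (void)len; return -ENOSYS; }	/* stub */
static int send_op32(uint32_t len) { (void)len; return 0; }		/* stub */

static int sketch_copy_range(uint64_t len)
{
	int err;

	if (!no_op64) {
		err = send_op64(len);
		if (err != -ENOSYS)
			return err;
		no_op64 = true;		/* remember; never probe again */
	}
	if (len > UINT32_MAX)		/* legacy op takes a 32-bit length */
		len = UINT32_MAX;
	return send_op32((uint32_t)len);
}
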
@@ -3144,12 +3184,13 @@ static const struct address_space_operations fuse_file_aops  = {
	.readahead	= fuse_readahead,
	.writepages	= fuse_writepages,
	.launder_folio	= fuse_launder_folio,
-	.dirty_folio	= filemap_dirty_folio,
+	.dirty_folio	= iomap_dirty_folio,
+	.release_folio	= iomap_release_folio,
+	.invalidate_folio = iomap_invalidate_folio,
+	.is_partially_uptodate = iomap_is_partially_uptodate,
	.migrate_folio	= filemap_migrate_folio,
	.bmap		= fuse_bmap,
	.direct_IO	= fuse_direct_IO,
-	.write_begin	= fuse_write_begin,
-	.write_end	= fuse_write_end,
 };

 void fuse_init_file_inode(struct inode *inode, unsigned int flags)
diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h
index 5a9bd771a319..134bf44aff0d 100644
--- a/fs/fuse/fuse_dev_i.h
+++ b/fs/fuse/fuse_dev_i.h
@@ -12,6 +12,8 @@
 #define FUSE_INT_REQ_BIT (1ULL << 0)
 #define FUSE_REQ_ID_STEP (1ULL << 1)

+extern struct wait_queue_head fuse_dev_waitq;
+
 struct fuse_arg;
 struct fuse_args;
 struct fuse_pqueue;
@@ -37,15 +39,22 @@ struct fuse_copy_state {
	} ring;
 };

-static inline struct fuse_dev *fuse_get_dev(struct file *file)
+#define FUSE_DEV_SYNC_INIT ((struct fuse_dev *) 1)
+#define FUSE_DEV_PTR_MASK (~1UL)
+
+static inline struct fuse_dev *__fuse_get_dev(struct file *file)
 {
	/*
	 * Lockless access is OK, because file->private data is set
	 * once during mount and is valid until the file is released.
	 */
-	return READ_ONCE(file->private_data);
+	struct fuse_dev *fud = READ_ONCE(file->private_data);
+
+	return (typeof(fud)) ((unsigned long) fud & FUSE_DEV_PTR_MASK);
 }

+struct fuse_dev *fuse_get_dev(struct file *file);
+
 unsigned int fuse_req_hash(u64 unique);
 struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique);

@@ -53,6 +62,7 @@ void fuse_dev_end_requests(struct list_head *head);

 void fuse_copy_init(struct fuse_copy_state *cs, bool write,
		    struct iov_iter *iter);
+void fuse_copy_finish(struct fuse_copy_state *cs);
 int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs,
		   unsigned int argpages, struct fuse_arg *args,
		   int zeroing);
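
FUSE_DEV_SYNC_INIT and FUSE_DEV_PTR_MASK rely on pointer alignment: a real struct fuse_dev pointer never has bit 0 set, so that bit can carry a sentinel which __fuse_get_dev() strips. A self-contained illustration of the trick (invented names, plain C):

#include <assert.h>
#include <stdint.h>

#define SYNC_INIT_TAG ((void *) 1)
#define PTR_MASK (~(uintptr_t) 1)

static void *strip_tag(void *p)
{
	return (void *) ((uintptr_t) p & PTR_MASK);
}

int main(void)
{
	static int dev;		/* stands in for a real, aligned object */

	assert(strip_tag(SYNC_INIT_TAG) == NULL);	/* sentinel reads as "no device" */
	assert(strip_tag(&dev) == &dev);		/* real pointers pass through */
	return 0;
}
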
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index b54f4f57789f..7f16049387d1 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -54,6 +54,13 @@
 /** Frequency (in jiffies) of request timeout checks, if opted into */
 extern const unsigned long fuse_timeout_timer_freq;

+/*
+ * Dentries invalidation workqueue period, in seconds. The value of this
+ * parameter shall be >= FUSE_DENTRY_INVAL_FREQ_MIN seconds, or 0 (zero), in
+ * which case no workqueue will be created.
+ */
+extern unsigned inval_wq __read_mostly;
+
 /** Maximum of max_pages received in init_out */
 extern unsigned int fuse_max_pages_limit;

 /*
@@ -210,6 +217,12 @@ struct fuse_inode {
	/** Reference to backing file in passthrough mode */
	struct fuse_backing *fb;
 #endif
+
+	/*
+	 * The underlying inode->i_blkbits value will not be modified,
+	 * so preserve the blocksize specified by the server.
+	 */
+	u8 cached_i_blkbits;
 };

 /** FUSE inode state bits */
@@ -226,6 +239,11 @@ enum {
	FUSE_I_BTIME,
	/* Wants or already has page cache IO */
	FUSE_I_CACHE_IO_MODE,
+	/*
+	 * Client has exclusive access to the inode, either because fs is local
+	 * or the fuse server has an exclusive "lease" on distributed fs
+	 */
+	FUSE_I_EXCLUSIVE,
 };

 struct fuse_conn;
@@ -636,6 +654,8 @@ struct fuse_conn {
	/** Current epoch for up-to-date dentries */
	atomic_t epoch;

+	struct work_struct epoch_work;
+
	struct rcu_head rcu;

	/** The user id for this mount */
@@ -850,6 +870,9 @@ struct fuse_conn {
	/** Does the filesystem support copy_file_range? */
	unsigned no_copy_file_range:1;

+	/** Does the filesystem support copy_file_range_64? */
+	unsigned no_copy_file_range_64:1;
+
	/* Send DESTROY request */
	unsigned int destroy:1;

@@ -895,6 +918,9 @@ struct fuse_conn {
	/* Is link not implemented by fs? */
	unsigned int no_link:1;

+	/* Is synchronous FUSE_INIT allowed? */
+	unsigned int sync_init:1;
+
	/* Use io_uring for communication */
	unsigned int io_uring;

@@ -913,12 +939,6 @@ struct fuse_conn {
	/** Device ID from the root super block */
	dev_t dev;

-	/** Dentries in the control filesystem */
-	struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES];
-
-	/** number of dentries used in the above array */
-	int ctl_ndents;
-
	/** Key for lock owner ID scrambling */
	u32 scramble_key[4];

@@ -1032,7 +1052,7 @@ static inline struct fuse_conn *get_fuse_conn(struct inode *inode)
	return get_fuse_mount_super(inode->i_sb)->fc;
 }

-static inline struct fuse_inode *get_fuse_inode(struct inode *inode)
+static inline struct fuse_inode *get_fuse_inode(const struct inode *inode)
 {
	return container_of(inode, struct fuse_inode, inode);
 }
@@ -1074,6 +1094,13 @@ static inline bool fuse_is_bad(struct inode *inode)
	return unlikely(test_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state));
 }

+static inline bool fuse_inode_is_exclusive(const struct inode *inode)
+{
+	const struct fuse_inode *fi = get_fuse_inode(inode);
+
+	return test_bit(FUSE_I_EXCLUSIVE, &fi->state);
+}
+
 static inline struct folio **fuse_folios_alloc(unsigned int nfolios, gfp_t flags,
					       struct fuse_folio_desc **desc)
 {
@@ -1109,7 +1136,6 @@ static inline void fuse_sync_bucket_dec(struct fuse_sync_bucket *bucket)

 extern const struct file_operations fuse_dev_operations;
 extern const struct dentry_operations fuse_dentry_operations;
-extern const struct dentry_operations fuse_root_dentry_operations;

 /**
  * Get a filled in inode
@@ -1248,6 +1274,11 @@ int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args,
			   gfp_t gfp_flags);

 /**
+ * Assign a unique id to a fuse request
+ */
+void fuse_request_assign_unique(struct fuse_iqueue *fiq, struct fuse_req *req);
+
+/**
  * End a finished request
  */
 void fuse_request_end(struct fuse_req *req);
@@ -1259,6 +1290,11 @@ void fuse_wait_aborted(struct fuse_conn *fc);
 /* Check if any requests timed out */
 void fuse_check_timeout(struct work_struct *work);

+void fuse_dentry_tree_init(void);
+void fuse_dentry_tree_cleanup(void);
+
+void fuse_epoch_work(struct work_struct *work);
+
 /**
  * Invalidate inode attributes
  */
@@ -1308,7 +1344,7 @@ struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc);
 struct fuse_dev *fuse_dev_alloc(void);
 void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc);
 void fuse_dev_free(struct fuse_dev *fud);
-void fuse_send_init(struct fuse_mount *fm);
+int fuse_send_init(struct fuse_mount *fm);

 /**
  * Fill in superblock and initialize fuse connection
@@ -1400,6 +1436,12 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
 int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
			     u64 child_nodeid, struct qstr *name, u32 flags);

+/*
+ * Try to prune this inode. If neither the inode itself nor dentries associated
+ * with this inode have any external reference, then the inode can be freed.
+ */
+void fuse_try_prune_one_inode(struct fuse_conn *fc, u64 nodeid);
+
 int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
		 bool isdir);

@@ -1486,9 +1528,9 @@ void fuse_dax_cancel_work(struct fuse_conn *fc);
 long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
			    unsigned long arg);
-int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int fuse_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
 int fuse_fileattr_set(struct mnt_idmap *idmap,
-		      struct dentry *dentry, struct fileattr *fa);
+		      struct dentry *dentry, struct file_kattr *fa);

 /* iomode.c */
 int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff);
@@ -1505,29 +1547,11 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
 void fuse_file_release(struct inode *inode, struct fuse_file *ff,
		       unsigned int open_flags, fl_owner_t id, bool isdir);

-/* passthrough.c */
-static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi)
-{
-#ifdef CONFIG_FUSE_PASSTHROUGH
-	return READ_ONCE(fi->fb);
-#else
-	return NULL;
-#endif
-}
-
-static inline struct fuse_backing *fuse_inode_backing_set(struct fuse_inode *fi,
-							  struct fuse_backing *fb)
-{
-#ifdef CONFIG_FUSE_PASSTHROUGH
-	return xchg(&fi->fb, fb);
-#else
-	return NULL;
-#endif
-}
-
+/* backing.c */
 #ifdef CONFIG_FUSE_PASSTHROUGH
 struct fuse_backing *fuse_backing_get(struct fuse_backing *fb);
 void fuse_backing_put(struct fuse_backing *fb);
+struct fuse_backing *fuse_backing_lookup(struct fuse_conn *fc, int backing_id);
 #else

 static inline struct fuse_backing *fuse_backing_get(struct fuse_backing *fb)
@@ -1538,6 +1562,11 @@ static inline struct fuse_backing *fuse_backing_get(struct fuse_backing *fb)
 static inline void fuse_backing_put(struct fuse_backing *fb)
 {
 }
+
+static inline struct fuse_backing *fuse_backing_lookup(struct fuse_conn *fc,
+						       int backing_id)
+{
+	return NULL;
+}
 #endif

 void fuse_backing_files_init(struct fuse_conn *fc);
@@ -1545,9 +1574,27 @@ void fuse_backing_files_free(struct fuse_conn *fc);
 int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map);
 int fuse_backing_close(struct fuse_conn *fc, int backing_id);

-struct fuse_backing *fuse_passthrough_open(struct file *file,
-					   struct inode *inode,
-					   int backing_id);
+/* passthrough.c */
+static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi)
+{
+#ifdef CONFIG_FUSE_PASSTHROUGH
+	return READ_ONCE(fi->fb);
+#else
+	return NULL;
+#endif
+}
+
+static inline struct fuse_backing *fuse_inode_backing_set(struct fuse_inode *fi,
+							  struct fuse_backing *fb)
+{
+#ifdef CONFIG_FUSE_PASSTHROUGH
+	return xchg(&fi->fb, fb);
+#else
+	return NULL;
+#endif
+}
+
+struct fuse_backing *fuse_passthrough_open(struct file *file, int backing_id);
 void fuse_passthrough_release(struct fuse_file *ff, struct fuse_backing *fb);

 static inline struct file *fuse_file_passthrough(struct fuse_file *ff)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index bfe8d8af46f3..819e50d66622 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -7,8 +7,10 @@
  */

 #include "fuse_i.h"
+#include "fuse_dev_i.h"
 #include "dev_uring_i.h"

+#include <linux/dax.h>
 #include <linux/pagemap.h>
 #include <linux/slab.h>
 #include <linux/file.h>
@@ -33,6 +35,7 @@ MODULE_LICENSE("GPL");

 static struct kmem_cache *fuse_inode_cachep;
 struct list_head fuse_conn_list;
 DEFINE_MUTEX(fuse_mutex);
+DECLARE_WAIT_QUEUE_HEAD(fuse_dev_waitq);

 static int set_global_limit(const char *val, const struct kernel_param *kp);
@@ -100,14 +103,11 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
	if (!fi)
		return NULL;

-	fi->i_time = 0;
+	/* Initialize private data (i.e. everything except fi->inode) */
+	BUILD_BUG_ON(offsetof(struct fuse_inode, inode) != 0);
+	memset((void *) fi + sizeof(fi->inode), 0, sizeof(*fi) - sizeof(fi->inode));
+
	fi->inval_mask = ~0;
-	fi->nodeid = 0;
-	fi->nlookup = 0;
-	fi->attr_version = 0;
-	fi->orig_ino = 0;
-	fi->state = 0;
-	fi->submount_lookup = NULL;
	mutex_init(&fi->mutex);
	spin_lock_init(&fi->lock);
	fi->forget = fuse_alloc_forget();
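
The rewritten fuse_alloc_inode() zeroes every private field with one memset by exploiting the struct layout; the BUILD_BUG_ON pins down the prerequisite that the embedded inode stays the first member. The same idiom in standalone C (hypothetical types):

#include <stddef.h>
#include <string.h>

struct sketch_inode {
	struct { long placeholder; } vfs_inode;	/* must remain first */
	unsigned long flags;
	unsigned long long attr_version;
};

_Static_assert(offsetof(struct sketch_inode, vfs_inode) == 0,
	       "vfs_inode must be the first member");

static void sketch_reset_private(struct sketch_inode *si)
{
	/* wipe everything after the embedded inode, leave the inode alone */
	memset((char *) si + sizeof(si->vfs_inode), 0,
	       sizeof(*si) - sizeof(si->vfs_inode));
}
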
@@ -160,7 +160,10 @@ static void fuse_evict_inode(struct inode *inode)
	struct fuse_inode *fi = get_fuse_inode(inode);

	/* Will write inode on close/munmap and in all other dirtiers */
-	WARN_ON(inode->i_state & I_DIRTY_INODE);
+	WARN_ON(inode_state_read_once(inode) & I_DIRTY_INODE);
+
+	if (FUSE_IS_DAX(inode))
+		dax_break_layout_final(inode);

	truncate_inode_pages_final(&inode->i_data);
	clear_inode(inode);
@@ -285,10 +288,10 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
		}
	}

-	if (attr->blksize != 0)
-		inode->i_blkbits = ilog2(attr->blksize);
+	if (attr->blksize)
+		fi->cached_i_blkbits = ilog2(attr->blksize);
	else
-		inode->i_blkbits = inode->i_sb->s_blocksize_bits;
+		fi->cached_i_blkbits = inode->i_sb->s_blocksize_bits;

	/*
	 * Don't set the sticky bit in i_mode, unless we want the VFS
@@ -502,7 +505,7 @@ retry:
	if (!inode)
		return NULL;

-	if ((inode->i_state & I_NEW)) {
+	if ((inode_state_read_once(inode) & I_NEW)) {
		inode->i_flags |= S_NOATIME;
		if (!fc->writeback_cache || !S_ISREG(attr->mode))
			inode->i_flags |= S_NOCMTIME;
@@ -582,6 +585,17 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
	return 0;
 }

+void fuse_try_prune_one_inode(struct fuse_conn *fc, u64 nodeid)
+{
+	struct inode *inode;
+
+	inode = fuse_ilookup(fc, nodeid, NULL);
+	if (!inode)
+		return;
+	d_prune_aliases(inode);
+	iput(inode);
+}
+
 bool fuse_lock_inode(struct inode *inode)
 {
	bool locked = false;
@@ -963,6 +977,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
	refcount_set(&fc->count, 1);
	atomic_set(&fc->dev_count, 1);
	atomic_set(&fc->epoch, 1);
+	INIT_WORK(&fc->epoch_work, fuse_epoch_work);
	init_waitqueue_head(&fc->blocked_waitq);
	fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv);
	INIT_LIST_HEAD(&fc->bg_queue);
@@ -1007,26 +1022,28 @@ static void delayed_release(struct rcu_head *p)

 void fuse_conn_put(struct fuse_conn *fc)
 {
-	if (refcount_dec_and_test(&fc->count)) {
-		struct fuse_iqueue *fiq = &fc->iq;
-		struct fuse_sync_bucket *bucket;
-
-		if (IS_ENABLED(CONFIG_FUSE_DAX))
-			fuse_dax_conn_free(fc);
-		if (fc->timeout.req_timeout)
-			cancel_delayed_work_sync(&fc->timeout.work);
-		if (fiq->ops->release)
-			fiq->ops->release(fiq);
-		put_pid_ns(fc->pid_ns);
-		bucket = rcu_dereference_protected(fc->curr_bucket, 1);
-		if (bucket) {
-			WARN_ON(atomic_read(&bucket->count) != 1);
-			kfree(bucket);
-		}
-		if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
-			fuse_backing_files_free(fc);
-		call_rcu(&fc->rcu, delayed_release);
+	struct fuse_iqueue *fiq = &fc->iq;
+	struct fuse_sync_bucket *bucket;
+
+	if (!refcount_dec_and_test(&fc->count))
+		return;
+
+	if (IS_ENABLED(CONFIG_FUSE_DAX))
+		fuse_dax_conn_free(fc);
+	if (fc->timeout.req_timeout)
+		cancel_delayed_work_sync(&fc->timeout.work);
+	cancel_work_sync(&fc->epoch_work);
+	if (fiq->ops->release)
+		fiq->ops->release(fiq);
+	put_pid_ns(fc->pid_ns);
+	bucket = rcu_dereference_protected(fc->curr_bucket, 1);
+	if (bucket) {
+		WARN_ON(atomic_read(&bucket->count) != 1);
+		kfree(bucket);
+	}
+	if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
+		fuse_backing_files_free(fc);
+	call_rcu(&fc->rcu, delayed_release);
 }
 EXPORT_SYMBOL_GPL(fuse_conn_put);
@@ -1205,7 +1222,7 @@ static const struct super_operations fuse_super_operations = {
	.free_inode	= fuse_free_inode,
	.evict_inode	= fuse_evict_inode,
	.write_inode	= fuse_write_inode,
-	.drop_inode	= generic_delete_inode,
+	.drop_inode	= inode_just_drop,
	.umount_begin	= fuse_umount_begin,
	.statfs		= fuse_statfs,
	.sync_fs	= fuse_sync_fs,
@@ -1269,7 +1286,7 @@ static void set_request_timeout(struct fuse_conn *fc, unsigned int timeout)
 {
	fc->timeout.req_timeout = secs_to_jiffies(timeout);
	INIT_DELAYED_WORK(&fc->timeout.work, fuse_check_timeout);
-	queue_delayed_work(system_wq, &fc->timeout.work,
+	queue_delayed_work(system_percpu_wq, &fc->timeout.work,
			   fuse_timeout_timer_freq);
 }

@@ -1465,7 +1482,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
	wake_up_all(&fc->blocked_waitq);
 }

-void fuse_send_init(struct fuse_mount *fm)
+static struct fuse_init_args *fuse_new_init(struct fuse_mount *fm)
 {
	struct fuse_init_args *ia;
	u64 flags;
@@ -1524,10 +1541,30 @@ void fuse_send_init(struct fuse_mount *fm)
	ia->args.out_args[0].value = &ia->out;
	ia->args.force = true;
	ia->args.nocreds = true;
-	ia->args.end = process_init_reply;
-	if (fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0)
-		process_init_reply(fm, &ia->args, -ENOTCONN);
+	return ia;
+}
+
+int fuse_send_init(struct fuse_mount *fm)
+{
+	struct fuse_init_args *ia = fuse_new_init(fm);
+	int err;
+
+	if (fm->fc->sync_init) {
+		err = fuse_simple_request(fm, &ia->args);
+		/* Ignore size of init reply */
+		if (err > 0)
+			err = 0;
+	} else {
+		ia->args.end = process_init_reply;
+		err = fuse_simple_background(fm, &ia->args, GFP_KERNEL);
+		if (!err)
+			return 0;
+	}
+	process_init_reply(fm, &ia->args, err);
+	if (fm->fc->conn_error)
+		return -ENOTCONN;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(fuse_send_init);
@@ -1557,8 +1594,6 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
	if (err)
		return err;

-	/* fuse does it's own writeback accounting */
-	sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT;
	sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT;

	/*
@@ -1715,7 +1750,7 @@ static int fuse_fill_super_submount(struct super_block *sb,
	fi = get_fuse_inode(root);
	fi->nlookup--;

-	sb->s_d_op = &fuse_dentry_operations;
+	set_default_d_op(sb, &fuse_dentry_operations);
	sb->s_root = d_make_root(root);
	if (!sb->s_root)
		return -ENOMEM;
@@ -1807,6 +1842,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
		if (!sb_set_blocksize(sb, ctx->blksize))
			goto err;
 #endif
+		fc->sync_fs = 1;
	} else {
		sb->s_blocksize = PAGE_SIZE;
		sb->s_blocksize_bits = PAGE_SHIFT;
@@ -1850,17 +1886,19 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)

	err = -ENOMEM;
	root = fuse_get_root_inode(sb, ctx->rootmode);
-	sb->s_d_op = &fuse_root_dentry_operations;
+	set_default_d_op(sb, &fuse_dentry_operations);
	root_dentry = d_make_root(root);
	if (!root_dentry)
		goto err_dev_free;
-	/* Root dentry doesn't have .d_revalidate */
-	sb->s_d_op = &fuse_dentry_operations;

	mutex_lock(&fuse_mutex);
	err = -EINVAL;
-	if (ctx->fudptr && *ctx->fudptr)
-		goto err_unlock;
+	if (ctx->fudptr && *ctx->fudptr) {
+		if (*ctx->fudptr == FUSE_DEV_SYNC_INIT)
+			fc->sync_init = 1;
+		else
+			goto err_unlock;
+	}

	err = fuse_ctl_add_conn(fc);
	if (err)
@@ -1868,8 +1906,10 @@
	list_add_tail(&fc->entry, &fuse_conn_list);
	sb->s_root = root_dentry;
-	if (ctx->fudptr)
+	if (ctx->fudptr) {
		*ctx->fudptr = fud;
+		wake_up_all(&fuse_dev_waitq);
+	}
	mutex_unlock(&fuse_mutex);
	return 0;

@@ -1890,6 +1930,7 @@ EXPORT_SYMBOL_GPL(fuse_fill_super_common);
 static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
 {
	struct fuse_fs_context *ctx = fsc->fs_private;
+	struct fuse_mount *fm;
	int err;

	if (!ctx->file || !ctx->rootmode_present ||
@@ -1910,8 +1951,10 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
		return err;
	/* file->private_data shall be visible on all CPUs after this */
	smp_mb();
-	fuse_send_init(get_fuse_mount_super(sb));
-	return 0;
+
+	fm = get_fuse_mount_super(sb);
+
+	return fuse_send_init(fm);
 }

 /*
@@ -1972,7 +2015,7 @@ static int fuse_get_tree(struct fs_context *fsc)
	 * Allow creating a fuse mount with an already initialized fuse
	 * connection
	 */
-	fud = READ_ONCE(ctx->file->private_data);
+	fud = __fuse_get_dev(ctx->file);
	if (ctx->file->f_op == &fuse_dev_operations && fud) {
		fsc->sget_key = fud->fc;
		sb = sget_fc(fsc, fuse_test_super, fuse_set_no_super);
@@ -2243,6 +2286,8 @@ static int __init fuse_init(void)
	if (res)
		goto err_sysfs_cleanup;

+	fuse_dentry_tree_init();
+
	sanitize_global_limit(&max_user_bgreq);
	sanitize_global_limit(&max_user_congthresh);

@@ -2262,6 +2307,7 @@ static void __exit fuse_exit(void)
 {
	pr_debug("exit\n");

+	fuse_dentry_tree_cleanup();
	fuse_ctl_cleanup();
	fuse_sysfs_cleanup();
	fuse_fs_cleanup();
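
One plausible reading of the fuse_dev_waitq plumbing above (an assumption on my part; the waiter side is not shown in this diff): a task that finds only the FUSE_DEV_SYNC_INIT sentinel in file->private_data can sleep on the waitqueue until fuse_fill_super_common() stores the real device pointer and calls wake_up_all(). Sketch:

static struct fuse_dev *sketch_wait_for_dev(struct file *file)
{
	/*
	 * __fuse_get_dev() masks off the sentinel bit, so it yields NULL
	 * until *fudptr holds a real pointer.
	 */
	wait_event(fuse_dev_waitq, __fuse_get_dev(file) != NULL);
	return __fuse_get_dev(file);
}
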
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
index 2d9abf48828f..fdc175e93f74 100644
--- a/fs/fuse/ioctl.c
+++ b/fs/fuse/ioctl.c
@@ -502,7 +502,7 @@ static void fuse_priv_ioctl_cleanup(struct inode *inode, struct fuse_file *ff)
	fuse_file_release(inode, ff, O_RDONLY, NULL, S_ISDIR(inode->i_mode));
 }

-int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int fuse_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
 {
	struct inode *inode = d_inode(dentry);
	struct fuse_file *ff;
@@ -540,7 +540,7 @@ cleanup:
 }

 int fuse_fileattr_set(struct mnt_idmap *idmap,
-		      struct dentry *dentry, struct fileattr *fa)
+		      struct dentry *dentry, struct file_kattr *fa)
 {
	struct inode *inode = d_inode(dentry);
	struct fuse_file *ff;
diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c
index c99e285f3183..3728933188f3 100644
--- a/fs/fuse/iomode.c
+++ b/fs/fuse/iomode.c
@@ -177,8 +177,7 @@ static int fuse_file_passthrough_open(struct inode *inode, struct file *file)
	    (ff->open_flags & ~FOPEN_PASSTHROUGH_MASK))
		return -EINVAL;

-	fb = fuse_passthrough_open(file, inode,
-				   ff->args->open_outarg.backing_id);
+	fb = fuse_passthrough_open(file, ff->args->open_outarg.backing_id);
	if (IS_ERR(fb))
		return PTR_ERR(fb);

diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
index 607ef735ad4a..72de97c03d0e 100644
--- a/fs/fuse/passthrough.c
+++ b/fs/fuse/passthrough.c
@@ -144,166 +144,12 @@ ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma)
	return backing_file_mmap(backing_file, vma, &ctx);
 }

-struct fuse_backing *fuse_backing_get(struct fuse_backing *fb)
-{
-	if (fb && refcount_inc_not_zero(&fb->count))
-		return fb;
-	return NULL;
-}
-
-static void fuse_backing_free(struct fuse_backing *fb)
-{
-	pr_debug("%s: fb=0x%p\n", __func__, fb);
-
-	if (fb->file)
-		fput(fb->file);
-	put_cred(fb->cred);
-	kfree_rcu(fb, rcu);
-}
-
-void fuse_backing_put(struct fuse_backing *fb)
-{
-	if (fb && refcount_dec_and_test(&fb->count))
-		fuse_backing_free(fb);
-}
-
-void fuse_backing_files_init(struct fuse_conn *fc)
-{
-	idr_init(&fc->backing_files_map);
-}
-
-static int fuse_backing_id_alloc(struct fuse_conn *fc, struct fuse_backing *fb)
-{
-	int id;
-
-	idr_preload(GFP_KERNEL);
-	spin_lock(&fc->lock);
-	/* FIXME: xarray might be space inefficient */
-	id = idr_alloc_cyclic(&fc->backing_files_map, fb, 1, 0, GFP_ATOMIC);
-	spin_unlock(&fc->lock);
-	idr_preload_end();
-
-	WARN_ON_ONCE(id == 0);
-	return id;
-}
-
-static struct fuse_backing *fuse_backing_id_remove(struct fuse_conn *fc,
-						   int id)
-{
-	struct fuse_backing *fb;
-
-	spin_lock(&fc->lock);
-	fb = idr_remove(&fc->backing_files_map, id);
-	spin_unlock(&fc->lock);
-
-	return fb;
-}
-
-static int fuse_backing_id_free(int id, void *p, void *data)
-{
-	struct fuse_backing *fb = p;
-
-	WARN_ON_ONCE(refcount_read(&fb->count) != 1);
-	fuse_backing_free(fb);
-	return 0;
-}
-
-void fuse_backing_files_free(struct fuse_conn *fc)
-{
-	idr_for_each(&fc->backing_files_map, fuse_backing_id_free, NULL);
-	idr_destroy(&fc->backing_files_map);
-}
-
-int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map)
-{
-	struct file *file;
-	struct super_block *backing_sb;
-	struct fuse_backing *fb = NULL;
-	int res;
-
-	pr_debug("%s: fd=%d flags=0x%x\n", __func__, map->fd, map->flags);
-
-	/* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */
-	res = -EPERM;
-	if (!fc->passthrough || !capable(CAP_SYS_ADMIN))
-		goto out;
-
-	res = -EINVAL;
-	if (map->flags || map->padding)
-		goto out;
-
-	file = fget_raw(map->fd);
-	res = -EBADF;
-	if (!file)
-		goto out;
-
-	backing_sb = file_inode(file)->i_sb;
-	res = -ELOOP;
-	if (backing_sb->s_stack_depth >= fc->max_stack_depth)
-		goto out_fput;
-
-	fb = kmalloc(sizeof(struct fuse_backing), GFP_KERNEL);
-	res = -ENOMEM;
-	if (!fb)
-		goto out_fput;
-
-	fb->file = file;
-	fb->cred = prepare_creds();
-	refcount_set(&fb->count, 1);
-
-	res = fuse_backing_id_alloc(fc, fb);
-	if (res < 0) {
-		fuse_backing_free(fb);
-		fb = NULL;
-	}
-
-out:
-	pr_debug("%s: fb=0x%p, ret=%i\n", __func__, fb, res);
-
-	return res;
-
-out_fput:
-	fput(file);
-	goto out;
-}
-
-int fuse_backing_close(struct fuse_conn *fc, int backing_id)
-{
-	struct fuse_backing *fb = NULL;
-	int err;
-
-	pr_debug("%s: backing_id=%d\n", __func__, backing_id);
-
-	/* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */
-	err = -EPERM;
-	if (!fc->passthrough || !capable(CAP_SYS_ADMIN))
-		goto out;
-
-	err = -EINVAL;
-	if (backing_id <= 0)
-		goto out;
-
-	err = -ENOENT;
-	fb = fuse_backing_id_remove(fc, backing_id);
-	if (!fb)
-		goto out;
-
-	fuse_backing_put(fb);
-	err = 0;
-out:
-	pr_debug("%s: fb=0x%p, err=%i\n", __func__, fb, err);
-
-	return err;
-}
-
 /*
  * Setup passthrough to a backing file.
  *
  * Returns an fb object with elevated refcount to be stored in fuse inode.
  */
-struct fuse_backing *fuse_passthrough_open(struct file *file,
-					   struct inode *inode,
-					   int backing_id)
+struct fuse_backing *fuse_passthrough_open(struct file *file, int backing_id)
 {
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fm->fc;
@@ -315,12 +161,8 @@ struct fuse_backing *fuse_passthrough_open(struct file *file,
	if (backing_id <= 0)
		goto out;

-	rcu_read_lock();
-	fb = idr_find(&fc->backing_files_map, backing_id);
-	fb = fuse_backing_get(fb);
-	rcu_read_unlock();
-
	err = -ENOENT;
+	fb = fuse_backing_lookup(fc, backing_id);
	if (!fb)
		goto out;

diff --git a/fs/fuse/trace.c b/fs/fuse/trace.c
new file mode 100644
index 000000000000..93bd72efc98c
--- /dev/null
+++ b/fs/fuse/trace.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "dev_uring_i.h"
+#include "fuse_i.h"
+#include "fuse_dev_i.h"
+
+#include <linux/pagemap.h>
+
+#define CREATE_TRACE_POINTS
+#include "fuse_trace.h"
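
The new trace.c follows the standard kernel tracepoint convention: exactly one translation unit defines CREATE_TRACE_POINTS before including the trace header, which expands the declarations into definitions; every other user includes the header bare. The skeleton of such a header (a generic illustration, not fuse_trace.h itself):

#undef TRACE_SYSTEM
#define TRACE_SYSTEM sketch

#if !defined(_TRACE_SKETCH_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SKETCH_H

#include <linux/tracepoint.h>

TRACE_EVENT(sketch_event,
	TP_PROTO(int value),
	TP_ARGS(value),
	TP_STRUCT__entry(__field(int, value)),
	TP_fast_assign(__entry->value = value;),
	TP_printk("value=%d", __entry->value)
);

#endif /* _TRACE_SKETCH_H */

/* expands to definitions only where CREATE_TRACE_POINTS is defined */
#include <trace/define_trace.h>
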
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 53c2626e90e7..b2f6486fe1d5 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -9,7 +9,6 @@
 #include <linux/pci.h>
 #include <linux/interrupt.h>
 #include <linux/group_cpus.h>
-#include <linux/pfn_t.h>
 #include <linux/memremap.h>
 #include <linux/module.h>
 #include <linux/virtio.h>
@@ -21,6 +20,7 @@
 #include <linux/cleanup.h>
 #include <linux/uio.h>
 #include "fuse_i.h"
+#include "fuse_dev_i.h"

 /* Used to help calculate the FUSE connection's max_pages limit for a request's
  * size. Parts of the struct fuse_req are sliced into scattergather lists in
@@ -373,7 +373,7 @@ static int virtio_fs_add_queues_sysfs(struct virtio_fs *fs)
		sprintf(buff, "%d", i);
		fsvq->kobj = kobject_create_and_add(buff, fs->mqs_kobj);
-		if (!fs->mqs_kobj) {
+		if (!fsvq->kobj) {
			ret = -ENOMEM;
			goto out_del;
		}
@@ -762,7 +762,6 @@ static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req)
 static void virtio_fs_request_complete(struct fuse_req *req,
				       struct virtio_fs_vq *fsvq)
 {
-	struct fuse_pqueue *fpq = &fsvq->fud->pq;
	struct fuse_args *args;
	struct fuse_args_pages *ap;
	unsigned int len, i, thislen;
@@ -791,9 +790,7 @@ static void virtio_fs_request_complete(struct fuse_req *req,
		}
	}

-	spin_lock(&fpq->lock);
	clear_bit(FR_SENT, &req->flags);
-	spin_unlock(&fpq->lock);

	fuse_request_end(req);
	spin_lock(&fsvq->lock);
@@ -862,7 +859,7 @@ static void virtio_fs_requests_done_work(struct work_struct *work)
 static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *fs)
 {
	const struct cpumask *mask, *masks;
-	unsigned int q, cpu;
+	unsigned int q, cpu, nr_masks;

	/* First attempt to map using existing transport layer affinities
	 * e.g. PCIe MSI-X
@@ -882,7 +879,7 @@ static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *f
		return;
 fallback:
	/* Attempt to map evenly in groups over the CPUs */
-	masks = group_cpus_evenly(fs->num_request_queues);
+	masks = group_cpus_evenly(fs->num_request_queues, &nr_masks);
	/* If even this fails we default to all CPUs use first request queue */
	if (!masks) {
		for_each_possible_cpu(cpu)
@@ -891,7 +888,7 @@ fallback:
	}

	for (q = 0; q < fs->num_request_queues; q++) {
-		for_each_cpu(cpu, &masks[q])
+		for_each_cpu(cpu, &masks[q % nr_masks])
			fs->mq_map[cpu] = q + VQ_REQUEST;
	}
	kfree(masks);
@@ -1008,7 +1005,7 @@ static void virtio_fs_cleanup_vqs(struct virtio_device *vdev)
  */
 static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
				    long nr_pages, enum dax_access_mode mode,
-				    void **kaddr, pfn_t *pfn)
+				    void **kaddr, unsigned long *pfn)
 {
	struct virtio_fs *fs = dax_get_private(dax_dev);
	phys_addr_t offset = PFN_PHYS(pgoff);
@@ -1017,7 +1014,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
	if (kaddr)
		*kaddr = fs->window_kaddr + offset;
	if (pfn)
-		*pfn = phys_to_pfn_t(fs->window_phys_addr + offset, 0);
+		*pfn = PHYS_PFN(fs->window_phys_addr + offset);
	return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
 }

@@ -1385,7 +1382,7 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
	unsigned int out_sgs = 0;
	unsigned int in_sgs = 0;
	unsigned int total_sgs;
-	unsigned int i;
+	unsigned int i, hash;
	int ret;
	bool notify;
	struct fuse_pqueue *fpq;
@@ -1445,8 +1442,9 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,

	/* Request successfully sent. */
	fpq = &fsvq->fud->pq;
+	hash = fuse_req_hash(req->in.h.unique);
	spin_lock(&fpq->lock);
-	list_add_tail(&req->list, fpq->processing);
+	list_add_tail(&req->list, &fpq->processing[hash]);
	spin_unlock(&fpq->lock);
	set_bit(FR_SENT, &req->flags);
	/* matches barrier in request_wait_answer() */
@@ -1481,8 +1479,7 @@ static void virtio_fs_send_req(struct fuse_iqueue *fiq, struct fuse_req *req)
	struct virtio_fs_vq *fsvq;
	int ret;

-	if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
-		req->in.h.unique = fuse_get_unique(fiq);
+	fuse_request_assign_unique(fiq, req);

	clear_bit(FR_PENDING, &req->flags);
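
The new nr_masks argument matters because group_cpus_evenly() may return fewer CPU groups than requested queues; indexing with q % nr_masks reuses the groups round-robin instead of reading past the end of the array. A standalone illustration with invented values:

#include <stdio.h>

int main(void)
{
	unsigned int nr_queues = 8, nr_masks = 3;	/* invented values */

	for (unsigned int q = 0; q < nr_queues; q++)
		printf("request queue %u <- CPU mask group %u\n",
		       q, q % nr_masks);
	return 0;
}
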