summaryrefslogtreecommitdiff
path: root/fs/fuse
diff options
context:
space:
mode:
Diffstat (limited to 'fs/fuse')
-rw-r--r--fs/fuse/Kconfig3
-rw-r--r--fs/fuse/Makefile5
-rw-r--r--fs/fuse/backing.c179
-rw-r--r--fs/fuse/control.c58
-rw-r--r--fs/fuse/cuse.c3
-rw-r--r--fs/fuse/dax.c3
-rw-r--r--fs/fuse/dev.c242
-rw-r--r--fs/fuse/dev_uring.c35
-rw-r--r--fs/fuse/dir.c297
-rw-r--r--fs/fuse/file.c703
-rw-r--r--fs/fuse/fuse_dev_i.h14
-rw-r--r--fs/fuse/fuse_i.h115
-rw-r--r--fs/fuse/inode.c144
-rw-r--r--fs/fuse/ioctl.c4
-rw-r--r--fs/fuse/iomode.c3
-rw-r--r--fs/fuse/passthrough.c162
-rw-r--r--fs/fuse/trace.c13
-rw-r--r--fs/fuse/virtio_fs.c25
18 files changed, 1214 insertions, 794 deletions
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index ca215a3cba3e..3a4ae632c94a 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -2,6 +2,7 @@
config FUSE_FS
tristate "FUSE (Filesystem in Userspace) support"
select FS_POSIX_ACL
+ select FS_IOMAP
help
With FUSE it is possible to implement a fully functional filesystem
in a userspace program.
@@ -12,7 +13,7 @@ config FUSE_FS
although chances are your distribution already has that library
installed if you've installed the "fuse" package itself.
- See <file:Documentation/filesystems/fuse.rst> for more information.
+ See <file:Documentation/filesystems/fuse/fuse.rst> for more information.
See <file:Documentation/Changes> for needed library/utility version.
If you want to develop a userspace FS, or if you want to use
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 3f0f312a31c1..22ad9538dfc4 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -10,10 +10,11 @@ obj-$(CONFIG_FUSE_FS) += fuse.o
obj-$(CONFIG_CUSE) += cuse.o
obj-$(CONFIG_VIRTIO_FS) += virtiofs.o
-fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o
+fuse-y := trace.o # put trace.o first so we see ftrace errors sooner
+fuse-y += dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o
fuse-y += iomode.o
fuse-$(CONFIG_FUSE_DAX) += dax.o
-fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o
+fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o backing.o
fuse-$(CONFIG_SYSCTL) += sysctl.o
fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o
diff --git a/fs/fuse/backing.c b/fs/fuse/backing.c
new file mode 100644
index 000000000000..4afda419dd14
--- /dev/null
+++ b/fs/fuse/backing.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FUSE passthrough to backing file.
+ *
+ * Copyright (c) 2023 CTERA Networks.
+ */
+
+#include "fuse_i.h"
+
+#include <linux/file.h>
+
+struct fuse_backing *fuse_backing_get(struct fuse_backing *fb)
+{
+ if (fb && refcount_inc_not_zero(&fb->count))
+ return fb;
+ return NULL;
+}
+
+static void fuse_backing_free(struct fuse_backing *fb)
+{
+ pr_debug("%s: fb=0x%p\n", __func__, fb);
+
+ if (fb->file)
+ fput(fb->file);
+ put_cred(fb->cred);
+ kfree_rcu(fb, rcu);
+}
+
+void fuse_backing_put(struct fuse_backing *fb)
+{
+ if (fb && refcount_dec_and_test(&fb->count))
+ fuse_backing_free(fb);
+}
+
+void fuse_backing_files_init(struct fuse_conn *fc)
+{
+ idr_init(&fc->backing_files_map);
+}
+
+static int fuse_backing_id_alloc(struct fuse_conn *fc, struct fuse_backing *fb)
+{
+ int id;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock(&fc->lock);
+ /* FIXME: xarray might be space inefficient */
+ id = idr_alloc_cyclic(&fc->backing_files_map, fb, 1, 0, GFP_ATOMIC);
+ spin_unlock(&fc->lock);
+ idr_preload_end();
+
+ WARN_ON_ONCE(id == 0);
+ return id;
+}
+
+static struct fuse_backing *fuse_backing_id_remove(struct fuse_conn *fc,
+ int id)
+{
+ struct fuse_backing *fb;
+
+ spin_lock(&fc->lock);
+ fb = idr_remove(&fc->backing_files_map, id);
+ spin_unlock(&fc->lock);
+
+ return fb;
+}
+
+static int fuse_backing_id_free(int id, void *p, void *data)
+{
+ struct fuse_backing *fb = p;
+
+ WARN_ON_ONCE(refcount_read(&fb->count) != 1);
+ fuse_backing_free(fb);
+ return 0;
+}
+
+void fuse_backing_files_free(struct fuse_conn *fc)
+{
+ idr_for_each(&fc->backing_files_map, fuse_backing_id_free, NULL);
+ idr_destroy(&fc->backing_files_map);
+}
+
+int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map)
+{
+ struct file *file;
+ struct super_block *backing_sb;
+ struct fuse_backing *fb = NULL;
+ int res;
+
+ pr_debug("%s: fd=%d flags=0x%x\n", __func__, map->fd, map->flags);
+
+ /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */
+ res = -EPERM;
+ if (!fc->passthrough || !capable(CAP_SYS_ADMIN))
+ goto out;
+
+ res = -EINVAL;
+ if (map->flags || map->padding)
+ goto out;
+
+ file = fget_raw(map->fd);
+ res = -EBADF;
+ if (!file)
+ goto out;
+
+ /* read/write/splice/mmap passthrough only relevant for regular files */
+ res = d_is_dir(file->f_path.dentry) ? -EISDIR : -EINVAL;
+ if (!d_is_reg(file->f_path.dentry))
+ goto out_fput;
+
+ backing_sb = file_inode(file)->i_sb;
+ res = -ELOOP;
+ if (backing_sb->s_stack_depth >= fc->max_stack_depth)
+ goto out_fput;
+
+ fb = kmalloc(sizeof(struct fuse_backing), GFP_KERNEL);
+ res = -ENOMEM;
+ if (!fb)
+ goto out_fput;
+
+ fb->file = file;
+ fb->cred = prepare_creds();
+ refcount_set(&fb->count, 1);
+
+ res = fuse_backing_id_alloc(fc, fb);
+ if (res < 0) {
+ fuse_backing_free(fb);
+ fb = NULL;
+ }
+
+out:
+ pr_debug("%s: fb=0x%p, ret=%i\n", __func__, fb, res);
+
+ return res;
+
+out_fput:
+ fput(file);
+ goto out;
+}
+
+int fuse_backing_close(struct fuse_conn *fc, int backing_id)
+{
+ struct fuse_backing *fb = NULL;
+ int err;
+
+ pr_debug("%s: backing_id=%d\n", __func__, backing_id);
+
+ /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */
+ err = -EPERM;
+ if (!fc->passthrough || !capable(CAP_SYS_ADMIN))
+ goto out;
+
+ err = -EINVAL;
+ if (backing_id <= 0)
+ goto out;
+
+ err = -ENOENT;
+ fb = fuse_backing_id_remove(fc, backing_id);
+ if (!fb)
+ goto out;
+
+ fuse_backing_put(fb);
+ err = 0;
+out:
+ pr_debug("%s: fb=0x%p, err=%i\n", __func__, fb, err);
+
+ return err;
+}
+
+struct fuse_backing *fuse_backing_lookup(struct fuse_conn *fc, int backing_id)
+{
+ struct fuse_backing *fb;
+
+ rcu_read_lock();
+ fb = idr_find(&fc->backing_files_map, backing_id);
+ fb = fuse_backing_get(fb);
+ rcu_read_unlock();
+
+ return fb;
+}
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 2a730d88cc3b..140bd5730d99 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -11,6 +11,7 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs_context.h>
+#include <linux/namei.h>
#define FUSE_CTL_SUPER_MAGIC 0x65735543
@@ -204,15 +205,13 @@ static const struct file_operations fuse_conn_congestion_threshold_ops = {
static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
struct fuse_conn *fc,
- const char *name,
- int mode, int nlink,
+ const char *name, int mode,
const struct inode_operations *iop,
const struct file_operations *fop)
{
struct dentry *dentry;
struct inode *inode;
- BUG_ON(fc->ctl_ndents >= FUSE_CTL_NUM_DENTRIES);
dentry = d_alloc_name(parent, name);
if (!dentry)
return NULL;
@@ -232,12 +231,19 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
if (iop)
inode->i_op = iop;
inode->i_fop = fop;
- set_nlink(inode, nlink);
+ if (S_ISDIR(mode)) {
+ inc_nlink(d_inode(parent));
+ inc_nlink(inode);
+ }
inode->i_private = fc;
- d_add(dentry, inode);
-
- fc->ctl_dentry[fc->ctl_ndents++] = dentry;
-
+ d_make_persistent(dentry, inode);
+ dput(dentry);
+
+ /*
+ * We are returning a borrowed reference here - it's only good while
+ * fuse_mutex is held. Actually it's d_make_persistent() return
+ * value...
+ */
return dentry;
}
@@ -254,22 +260,21 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
return 0;
parent = fuse_control_sb->s_root;
- inc_nlink(d_inode(parent));
sprintf(name, "%u", fc->dev);
- parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2,
+ parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500,
&simple_dir_inode_operations,
&simple_dir_operations);
if (!parent)
goto err;
- if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1,
+ if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400,
NULL, &fuse_ctl_waiting_ops) ||
- !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1,
+ !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200,
NULL, &fuse_ctl_abort_ops) ||
!fuse_ctl_add_dentry(parent, fc, "max_background", S_IFREG | 0600,
- 1, NULL, &fuse_conn_max_background_ops) ||
+ NULL, &fuse_conn_max_background_ops) ||
!fuse_ctl_add_dentry(parent, fc, "congestion_threshold",
- S_IFREG | 0600, 1, NULL,
+ S_IFREG | 0600, NULL,
&fuse_conn_congestion_threshold_ops))
goto err;
@@ -280,27 +285,24 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
return -ENOMEM;
}
+static void remove_one(struct dentry *dentry)
+{
+ d_inode(dentry)->i_private = NULL;
+}
+
/*
* Remove a connection from the control filesystem (if it exists).
* Caller must hold fuse_mutex
*/
void fuse_ctl_remove_conn(struct fuse_conn *fc)
{
- int i;
+ char name[32];
if (!fuse_control_sb || fc->no_control)
return;
- for (i = fc->ctl_ndents - 1; i >= 0; i--) {
- struct dentry *dentry = fc->ctl_dentry[i];
- d_inode(dentry)->i_private = NULL;
- if (!i) {
- /* Get rid of submounts: */
- d_invalidate(dentry);
- }
- dput(dentry);
- }
- drop_nlink(d_inode(fuse_control_sb->s_root));
+ sprintf(name, "%u", fc->dev);
+ simple_remove_by_name(fuse_control_sb->s_root, name, remove_one);
}
static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fsc)
@@ -346,15 +348,11 @@ static int fuse_ctl_init_fs_context(struct fs_context *fsc)
static void fuse_ctl_kill_sb(struct super_block *sb)
{
- struct fuse_conn *fc;
-
mutex_lock(&fuse_mutex);
fuse_control_sb = NULL;
- list_for_each_entry(fc, &fuse_conn_list, entry)
- fc->ctl_ndents = 0;
mutex_unlock(&fuse_mutex);
- kill_litter_super(sb);
+ kill_anon_super(sb);
}
static struct file_system_type fuse_ctl_fs_type = {
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index b39844d75a80..28c96961e85d 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -52,6 +52,7 @@
#include <linux/user_namespace.h>
#include "fuse_i.h"
+#include "fuse_dev_i.h"
#define CUSE_CONNTBL_LEN 64
@@ -547,7 +548,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
*/
static int cuse_channel_release(struct inode *inode, struct file *file)
{
- struct fuse_dev *fud = file->private_data;
+ struct fuse_dev *fud = __fuse_get_dev(file);
struct cuse_conn *cc = fc_to_cc(fud->fc);
/* remove from the conntbl, no more access from this point on */
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 0502bf3cdf6a..ac6d4c1064cc 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -10,7 +10,6 @@
#include <linux/dax.h>
#include <linux/uio.h>
#include <linux/pagemap.h>
-#include <linux/pfn_t.h>
#include <linux/iomap.h>
#include <linux/interval_tree.h>
@@ -757,7 +756,7 @@ static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, unsigned int order,
vm_fault_t ret;
struct inode *inode = file_inode(vmf->vma->vm_file);
struct super_block *sb = inode->i_sb;
- pfn_t pfn;
+ unsigned long pfn;
int error = 0;
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_conn_dax *fcd = fc->dax;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index e80cd8f2c049..6d59cbc877c6 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -25,7 +25,6 @@
#include <linux/sched.h>
#include <linux/seq_file.h>
-#define CREATE_TRACE_POINTS
#include "fuse_trace.h"
MODULE_ALIAS_MISCDEV(FUSE_MINOR);
@@ -119,7 +118,7 @@ void fuse_check_timeout(struct work_struct *work)
goto abort_conn;
out:
- queue_delayed_work(system_wq, &fc->timeout.work,
+ queue_delayed_work(system_percpu_wq, &fc->timeout.work,
fuse_timeout_timer_freq);
return;
@@ -207,8 +206,9 @@ static struct fuse_req *fuse_get_req(struct mnt_idmap *idmap,
if (fuse_block_alloc(fc, for_background)) {
err = -EINTR;
- if (wait_event_killable_exclusive(fc->blocked_waitq,
- !fuse_block_alloc(fc, for_background)))
+ if (wait_event_state_exclusive(fc->blocked_waitq,
+ !fuse_block_alloc(fc, for_background),
+ (TASK_KILLABLE | TASK_FREEZABLE)))
goto out;
}
/* Matches smp_wmb() in fuse_set_initialized() */
@@ -322,6 +322,7 @@ unsigned int fuse_req_hash(u64 unique)
{
return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS);
}
+EXPORT_SYMBOL_GPL(fuse_req_hash);
/*
* A new request is available, wake fiq->waitq
@@ -369,12 +370,32 @@ void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
}
}
+static inline void fuse_request_assign_unique_locked(struct fuse_iqueue *fiq,
+ struct fuse_req *req)
+{
+ if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
+ req->in.h.unique = fuse_get_unique_locked(fiq);
+
+ /* tracepoint captures in.h.unique and in.h.len */
+ trace_fuse_request_send(req);
+}
+
+inline void fuse_request_assign_unique(struct fuse_iqueue *fiq,
+ struct fuse_req *req)
+{
+ if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
+ req->in.h.unique = fuse_get_unique(fiq);
+
+ /* tracepoint captures in.h.unique and in.h.len */
+ trace_fuse_request_send(req);
+}
+EXPORT_SYMBOL_GPL(fuse_request_assign_unique);
+
static void fuse_dev_queue_req(struct fuse_iqueue *fiq, struct fuse_req *req)
{
spin_lock(&fiq->lock);
if (fiq->connected) {
- if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
- req->in.h.unique = fuse_get_unique_locked(fiq);
+ fuse_request_assign_unique_locked(fiq, req);
list_add_tail(&req->list, &fiq->pending);
fuse_dev_wake_and_unlock(fiq);
} else {
@@ -397,7 +418,6 @@ static void fuse_send_one(struct fuse_iqueue *fiq, struct fuse_req *req)
req->in.h.len = sizeof(struct fuse_in_header) +
fuse_len_args(req->args->in_numargs,
(struct fuse_arg *) req->args->in_args);
- trace_fuse_request_send(req);
fiq->ops->send_req(fiq, req);
}
@@ -687,10 +707,10 @@ static bool fuse_request_queue_background_uring(struct fuse_conn *fc,
{
struct fuse_iqueue *fiq = &fc->iq;
- req->in.h.unique = fuse_get_unique(fiq);
req->in.h.len = sizeof(struct fuse_in_header) +
fuse_len_args(req->args->in_numargs,
(struct fuse_arg *) req->args->in_args);
+ fuse_request_assign_unique(fiq, req);
return fuse_uring_queue_bq_req(req);
}
@@ -826,7 +846,7 @@ void fuse_copy_init(struct fuse_copy_state *cs, bool write,
}
/* Unmap and put previous page of userspace buffer */
-static void fuse_copy_finish(struct fuse_copy_state *cs)
+void fuse_copy_finish(struct fuse_copy_state *cs)
{
if (cs->currbuf) {
struct pipe_buffer *buf = cs->currbuf;
@@ -935,7 +955,7 @@ static int fuse_check_folio(struct folio *folio)
{
if (folio_mapped(folio) ||
folio->mapping != NULL ||
- (folio->flags & PAGE_FLAGS_CHECK_AT_PREP &
+ (folio->flags.f & PAGE_FLAGS_CHECK_AT_PREP &
~(1 << PG_locked |
1 << PG_referenced |
1 << PG_lru |
@@ -1528,14 +1548,34 @@ static int fuse_dev_open(struct inode *inode, struct file *file)
return 0;
}
+struct fuse_dev *fuse_get_dev(struct file *file)
+{
+ struct fuse_dev *fud = __fuse_get_dev(file);
+ int err;
+
+ if (likely(fud))
+ return fud;
+
+ err = wait_event_interruptible(fuse_dev_waitq,
+ READ_ONCE(file->private_data) != FUSE_DEV_SYNC_INIT);
+ if (err)
+ return ERR_PTR(err);
+
+ fud = __fuse_get_dev(file);
+ if (!fud)
+ return ERR_PTR(-EPERM);
+
+ return fud;
+}
+
static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to)
{
struct fuse_copy_state cs;
struct file *file = iocb->ki_filp;
struct fuse_dev *fud = fuse_get_dev(file);
- if (!fud)
- return -EPERM;
+ if (IS_ERR(fud))
+ return PTR_ERR(fud);
if (!user_backed_iter(to))
return -EINVAL;
@@ -1555,8 +1595,8 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
struct fuse_copy_state cs;
struct fuse_dev *fud = fuse_get_dev(in);
- if (!fud)
- return -EPERM;
+ if (IS_ERR(fud))
+ return PTR_ERR(fud);
bufs = kvmalloc_array(pipe->max_usage, sizeof(struct pipe_buffer),
GFP_KERNEL);
@@ -1600,35 +1640,31 @@ static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
struct fuse_copy_state *cs)
{
struct fuse_notify_poll_wakeup_out outarg;
- int err = -EINVAL;
+ int err;
if (size != sizeof(outarg))
- goto err;
+ return -EINVAL;
err = fuse_copy_one(cs, &outarg, sizeof(outarg));
if (err)
- goto err;
+ return err;
fuse_copy_finish(cs);
return fuse_notify_poll_wakeup(fc, &outarg);
-
-err:
- fuse_copy_finish(cs);
- return err;
}
static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
struct fuse_copy_state *cs)
{
struct fuse_notify_inval_inode_out outarg;
- int err = -EINVAL;
+ int err;
if (size != sizeof(outarg))
- goto err;
+ return -EINVAL;
err = fuse_copy_one(cs, &outarg, sizeof(outarg));
if (err)
- goto err;
+ return err;
fuse_copy_finish(cs);
down_read(&fc->killsb);
@@ -1636,10 +1672,6 @@ static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
outarg.off, outarg.len);
up_read(&fc->killsb);
return err;
-
-err:
- fuse_copy_finish(cs);
- return err;
}
static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
@@ -1647,29 +1679,26 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
{
struct fuse_notify_inval_entry_out outarg;
int err;
- char *buf = NULL;
+ char *buf;
struct qstr name;
- err = -EINVAL;
if (size < sizeof(outarg))
- goto err;
+ return -EINVAL;
err = fuse_copy_one(cs, &outarg, sizeof(outarg));
if (err)
- goto err;
+ return err;
- err = -ENAMETOOLONG;
if (outarg.namelen > fc->name_max)
- goto err;
+ return -ENAMETOOLONG;
err = -EINVAL;
if (size != sizeof(outarg) + outarg.namelen + 1)
- goto err;
+ return -EINVAL;
- err = -ENOMEM;
buf = kzalloc(outarg.namelen + 1, GFP_KERNEL);
if (!buf)
- goto err;
+ return -ENOMEM;
name.name = buf;
name.len = outarg.namelen;
@@ -1682,12 +1711,8 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
down_read(&fc->killsb);
err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name, outarg.flags);
up_read(&fc->killsb);
- kfree(buf);
- return err;
-
err:
kfree(buf);
- fuse_copy_finish(cs);
return err;
}
@@ -1696,29 +1721,25 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
{
struct fuse_notify_delete_out outarg;
int err;
- char *buf = NULL;
+ char *buf;
struct qstr name;
- err = -EINVAL;
if (size < sizeof(outarg))
- goto err;
+ return -EINVAL;
err = fuse_copy_one(cs, &outarg, sizeof(outarg));
if (err)
- goto err;
+ return err;
- err = -ENAMETOOLONG;
if (outarg.namelen > fc->name_max)
- goto err;
+ return -ENAMETOOLONG;
- err = -EINVAL;
if (size != sizeof(outarg) + outarg.namelen + 1)
- goto err;
+ return -EINVAL;
- err = -ENOMEM;
buf = kzalloc(outarg.namelen + 1, GFP_KERNEL);
if (!buf)
- goto err;
+ return -ENOMEM;
name.name = buf;
name.len = outarg.namelen;
@@ -1731,12 +1752,8 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
down_read(&fc->killsb);
err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name, 0);
up_read(&fc->killsb);
- kfree(buf);
- return err;
-
err:
kfree(buf);
- fuse_copy_finish(cs);
return err;
}
@@ -1754,17 +1771,15 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
loff_t file_size;
loff_t end;
- err = -EINVAL;
if (size < sizeof(outarg))
- goto out_finish;
+ return -EINVAL;
err = fuse_copy_one(cs, &outarg, sizeof(outarg));
if (err)
- goto out_finish;
+ return err;
- err = -EINVAL;
if (size - sizeof(outarg) != outarg.size)
- goto out_finish;
+ return -EINVAL;
nodeid = outarg.nodeid;
@@ -1824,8 +1839,6 @@ out_iput:
iput(inode);
out_up_killsb:
up_read(&fc->killsb);
-out_finish:
- fuse_copy_finish(cs);
return err;
}
@@ -1893,7 +1906,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,
index = outarg->offset >> PAGE_SHIFT;
- while (num) {
+ while (num && ap->num_folios < num_pages) {
struct folio *folio;
unsigned int folio_offset;
unsigned int nr_bytes;
@@ -1940,13 +1953,12 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
u64 nodeid;
int err;
- err = -EINVAL;
if (size != sizeof(outarg))
- goto copy_finish;
+ return -EINVAL;
err = fuse_copy_one(cs, &outarg, sizeof(outarg));
if (err)
- goto copy_finish;
+ return err;
fuse_copy_finish(cs);
@@ -1962,10 +1974,6 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
up_read(&fc->killsb);
return err;
-
-copy_finish:
- fuse_copy_finish(cs);
- return err;
}
/*
@@ -2033,17 +2041,54 @@ static int fuse_notify_resend(struct fuse_conn *fc)
/*
* Increments the fuse connection epoch. This will result of dentries from
- * previous epochs to be invalidated.
- *
- * XXX optimization: add call to shrink_dcache_sb()?
+ * previous epochs to be invalidated. Additionally, if inval_wq is set, a work
+ * queue is scheduled to trigger the invalidation.
*/
static int fuse_notify_inc_epoch(struct fuse_conn *fc)
{
atomic_inc(&fc->epoch);
+ if (inval_wq)
+ schedule_work(&fc->epoch_work);
return 0;
}
+static int fuse_notify_prune(struct fuse_conn *fc, unsigned int size,
+ struct fuse_copy_state *cs)
+{
+ struct fuse_notify_prune_out outarg;
+ const unsigned int batch = 512;
+ u64 *nodeids __free(kfree) = kmalloc(sizeof(u64) * batch, GFP_KERNEL);
+ unsigned int num, i;
+ int err;
+
+ if (!nodeids)
+ return -ENOMEM;
+
+ if (size < sizeof(outarg))
+ return -EINVAL;
+
+ err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+ if (err)
+ return err;
+
+ if (size - sizeof(outarg) != outarg.count * sizeof(u64))
+ return -EINVAL;
+
+ for (; outarg.count; outarg.count -= num) {
+ num = min(batch, outarg.count);
+ err = fuse_copy_one(cs, nodeids, num * sizeof(u64));
+ if (err)
+ return err;
+
+ scoped_guard(rwsem_read, &fc->killsb) {
+ for (i = 0; i < num; i++)
+ fuse_try_prune_one_inode(fc, nodeids[i]);
+ }
+ }
+ return 0;
+}
+
static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
unsigned int size, struct fuse_copy_state *cs)
{
@@ -2075,8 +2120,10 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
case FUSE_NOTIFY_INC_EPOCH:
return fuse_notify_inc_epoch(fc);
+ case FUSE_NOTIFY_PRUNE:
+ return fuse_notify_prune(fc, size, cs);
+
default:
- fuse_copy_finish(cs);
return -EINVAL;
}
}
@@ -2156,7 +2203,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
*/
if (!oh.unique) {
err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
- goto out;
+ goto copy_finish;
}
err = -EINVAL;
@@ -2229,7 +2276,7 @@ copy_finish:
static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from)
{
struct fuse_copy_state cs;
- struct fuse_dev *fud = fuse_get_dev(iocb->ki_filp);
+ struct fuse_dev *fud = __fuse_get_dev(iocb->ki_filp);
if (!fud)
return -EPERM;
@@ -2251,11 +2298,10 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
unsigned idx;
struct pipe_buffer *bufs;
struct fuse_copy_state cs;
- struct fuse_dev *fud;
+ struct fuse_dev *fud = __fuse_get_dev(out);
size_t rem;
ssize_t ret;
- fud = fuse_get_dev(out);
if (!fud)
return -EPERM;
@@ -2341,7 +2387,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait)
struct fuse_iqueue *fiq;
struct fuse_dev *fud = fuse_get_dev(file);
- if (!fud)
+ if (IS_ERR(fud))
return EPOLLERR;
fiq = &fud->fc->iq;
@@ -2394,7 +2440,7 @@ static void end_polls(struct fuse_conn *fc)
* The same effect is usually achievable through killing the filesystem daemon
* and all users of the filesystem. The exception is the combination of an
* asynchronous request and the tricky deadlock (see
- * Documentation/filesystems/fuse.rst).
+ * Documentation/filesystems/fuse/fuse.rst).
*
* Aborting requests under I/O goes as follows: 1: Separate out unlocked
* requests, they should be finished off immediately. Locked requests will be
@@ -2488,7 +2534,7 @@ void fuse_wait_aborted(struct fuse_conn *fc)
int fuse_dev_release(struct inode *inode, struct file *file)
{
- struct fuse_dev *fud = fuse_get_dev(file);
+ struct fuse_dev *fud = __fuse_get_dev(file);
if (fud) {
struct fuse_conn *fc = fud->fc;
@@ -2519,8 +2565,8 @@ static int fuse_dev_fasync(int fd, struct file *file, int on)
{
struct fuse_dev *fud = fuse_get_dev(file);
- if (!fud)
- return -EPERM;
+ if (IS_ERR(fud))
+ return PTR_ERR(fud);
/* No locking - fasync_helper does its own locking */
return fasync_helper(fd, file, on, &fud->fc->iq.fasync);
@@ -2530,7 +2576,7 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new)
{
struct fuse_dev *fud;
- if (new->private_data)
+ if (__fuse_get_dev(new))
return -EINVAL;
fud = fuse_dev_alloc_install(fc);
@@ -2561,7 +2607,7 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp)
* uses the same ioctl handler.
*/
if (fd_file(f)->f_op == file->f_op)
- fud = fuse_get_dev(fd_file(f));
+ fud = __fuse_get_dev(fd_file(f));
res = -EINVAL;
if (fud) {
@@ -2579,8 +2625,8 @@ static long fuse_dev_ioctl_backing_open(struct file *file,
struct fuse_dev *fud = fuse_get_dev(file);
struct fuse_backing_map map;
- if (!fud)
- return -EPERM;
+ if (IS_ERR(fud))
+ return PTR_ERR(fud);
if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
return -EOPNOTSUPP;
@@ -2596,8 +2642,8 @@ static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp)
struct fuse_dev *fud = fuse_get_dev(file);
int backing_id;
- if (!fud)
- return -EPERM;
+ if (IS_ERR(fud))
+ return PTR_ERR(fud);
if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
return -EOPNOTSUPP;
@@ -2608,6 +2654,19 @@ static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp)
return fuse_backing_close(fud->fc, backing_id);
}
+static long fuse_dev_ioctl_sync_init(struct file *file)
+{
+ int err = -EINVAL;
+
+ mutex_lock(&fuse_mutex);
+ if (!__fuse_get_dev(file)) {
+ WRITE_ONCE(file->private_data, FUSE_DEV_SYNC_INIT);
+ err = 0;
+ }
+ mutex_unlock(&fuse_mutex);
+ return err;
+}
+
static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
@@ -2623,6 +2682,9 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
case FUSE_DEV_IOC_BACKING_CLOSE:
return fuse_dev_ioctl_backing_close(file, argp);
+ case FUSE_DEV_IOC_SYNC_INIT:
+ return fuse_dev_ioctl_sync_init(file);
+
default:
return -ENOTTY;
}
@@ -2631,7 +2693,7 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
#ifdef CONFIG_PROC_FS
static void fuse_dev_show_fdinfo(struct seq_file *seq, struct file *file)
{
- struct fuse_dev *fud = fuse_get_dev(file);
+ struct fuse_dev *fud = __fuse_get_dev(file);
if (!fud)
return;
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index 249b210becb1..5ceb217ced1b 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -7,6 +7,7 @@
#include "fuse_i.h"
#include "dev_uring_i.h"
#include "fuse_dev_i.h"
+#include "fuse_trace.h"
#include <linux/fs.h>
#include <linux/io_uring/cmd.h>
@@ -85,6 +86,7 @@ static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req,
lockdep_assert_not_held(&queue->lock);
spin_lock(&queue->lock);
ent->fuse_req = NULL;
+ list_del_init(&req->list);
if (test_bit(FR_BACKGROUND, &req->flags)) {
queue->active_background--;
spin_lock(&fc->bg_lock);
@@ -351,7 +353,7 @@ static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent)
spin_unlock(&queue->lock);
if (cmd)
- io_uring_cmd_done(cmd, -ENOTCONN, 0, IO_URING_F_UNLOCKED);
+ io_uring_cmd_done(cmd, -ENOTCONN, IO_URING_F_UNLOCKED);
if (req)
fuse_uring_stop_fuse_req_end(req);
@@ -518,7 +520,7 @@ static void fuse_uring_cancel(struct io_uring_cmd *cmd,
if (need_cmd_done) {
/* no queue lock to avoid lock order issues */
- io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags);
+ io_uring_cmd_done(cmd, -ENOTCONN, issue_flags);
}
}
@@ -597,12 +599,14 @@ static int fuse_uring_copy_from_ring(struct fuse_ring *ring,
cs.is_uring = true;
cs.req = req;
- return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz);
+ err = fuse_copy_out_args(&cs, args, ring_in_out.payload_sz);
+ fuse_copy_finish(&cs);
+ return err;
}
- /*
- * Copy data from the req to the ring buffer
- */
+/*
+ * Copy data from the req to the ring buffer
+ */
static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req,
struct fuse_ring_ent *ent)
{
@@ -648,6 +652,7 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req,
/* copy the payload */
err = fuse_copy_args(&cs, num_args, args->in_pages,
(struct fuse_arg *)in_args, 0);
+ fuse_copy_finish(&cs);
if (err) {
pr_info_ratelimited("%s fuse_copy_args failed\n", __func__);
return err;
@@ -733,7 +738,7 @@ static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent,
list_move_tail(&ent->list, &queue->ent_in_userspace);
spin_unlock(&queue->lock);
- io_uring_cmd_done(cmd, 0, 0, issue_flags);
+ io_uring_cmd_done(cmd, 0, issue_flags);
return 0;
}
@@ -1139,9 +1144,9 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
return -EINVAL;
fud = fuse_get_dev(cmd->file);
- if (!fud) {
+ if (IS_ERR(fud)) {
pr_info_ratelimited("No fuse device found\n");
- return -ENOTCONN;
+ return PTR_ERR(fud);
}
fc = fud->fc;
@@ -1200,7 +1205,7 @@ static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd,
ent->cmd = NULL;
spin_unlock(&queue->lock);
- io_uring_cmd_done(cmd, ret, 0, issue_flags);
+ io_uring_cmd_done(cmd, ret, issue_flags);
}
/*
@@ -1208,14 +1213,15 @@ static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd,
* User buffers are not mapped yet - the application does not have permission
* to write to it - this has to be executed in ring task context.
*/
-static void fuse_uring_send_in_task(struct io_uring_cmd *cmd,
- unsigned int issue_flags)
+static void fuse_uring_send_in_task(struct io_tw_req tw_req, io_tw_token_t tw)
{
+ unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
+ struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd);
struct fuse_ring_queue *queue = ent->queue;
int err;
- if (!(issue_flags & IO_URING_F_TASK_DEAD)) {
+ if (!tw.cancel) {
err = fuse_uring_prepare_send(ent, ent->fuse_req);
if (err) {
fuse_uring_next_fuse_req(ent, queue, issue_flags);
@@ -1268,8 +1274,7 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req)
if (!queue)
goto err;
- if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
- req->in.h.unique = fuse_get_unique(fiq);
+ fuse_request_assign_unique(fiq, req);
spin_lock(&queue->lock);
err = -ENOTCONN;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 45b4c3cc1396..4b6b3d2758ff 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -27,6 +27,67 @@ module_param(allow_sys_admin_access, bool, 0644);
MODULE_PARM_DESC(allow_sys_admin_access,
"Allow users with CAP_SYS_ADMIN in initial userns to bypass allow_other access check");
+struct dentry_bucket {
+ struct rb_root tree;
+ spinlock_t lock;
+};
+
+#define HASH_BITS 5
+#define HASH_SIZE (1 << HASH_BITS)
+static struct dentry_bucket dentry_hash[HASH_SIZE];
+struct delayed_work dentry_tree_work;
+
+/* Minimum invalidation work queue frequency */
+#define FUSE_DENTRY_INVAL_FREQ_MIN 5
+
+unsigned __read_mostly inval_wq;
+static int inval_wq_set(const char *val, const struct kernel_param *kp)
+{
+ unsigned int num;
+ unsigned int old = inval_wq;
+ int ret;
+
+ if (!val)
+ return -EINVAL;
+
+ ret = kstrtouint(val, 0, &num);
+ if (ret)
+ return ret;
+
+ if ((num < FUSE_DENTRY_INVAL_FREQ_MIN) && (num != 0))
+ return -EINVAL;
+
+ /* This should prevent overflow in secs_to_jiffies() */
+ if (num > USHRT_MAX)
+ return -EINVAL;
+
+ *((unsigned int *)kp->arg) = num;
+
+ if (num && !old)
+ schedule_delayed_work(&dentry_tree_work,
+ secs_to_jiffies(num));
+ else if (!num && old)
+ cancel_delayed_work_sync(&dentry_tree_work);
+
+ return 0;
+}
+static const struct kernel_param_ops inval_wq_ops = {
+ .set = inval_wq_set,
+ .get = param_get_uint,
+};
+module_param_cb(inval_wq, &inval_wq_ops, &inval_wq, 0644);
+__MODULE_PARM_TYPE(inval_wq, "uint");
+MODULE_PARM_DESC(inval_wq,
+ "Dentries invalidation work queue period in secs (>= "
+ __stringify(FUSE_DENTRY_INVAL_FREQ_MIN) ").");
+
+static inline struct dentry_bucket *get_dentry_bucket(struct dentry *dentry)
+{
+ int i = hash_ptr(dentry, HASH_BITS);
+
+ return &dentry_hash[i];
+}
+
static void fuse_advise_use_readdirplus(struct inode *dir)
{
struct fuse_inode *fi = get_fuse_inode(dir);
@@ -34,33 +95,151 @@ static void fuse_advise_use_readdirplus(struct inode *dir)
set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state);
}
-#if BITS_PER_LONG >= 64
-static inline void __fuse_dentry_settime(struct dentry *entry, u64 time)
+struct fuse_dentry {
+ u64 time;
+ union {
+ struct rcu_head rcu;
+ struct rb_node node;
+ };
+ struct dentry *dentry;
+};
+
+static void __fuse_dentry_tree_del_node(struct fuse_dentry *fd,
+ struct dentry_bucket *bucket)
{
- entry->d_fsdata = (void *) time;
+ if (!RB_EMPTY_NODE(&fd->node)) {
+ rb_erase(&fd->node, &bucket->tree);
+ RB_CLEAR_NODE(&fd->node);
+ }
}
-static inline u64 fuse_dentry_time(const struct dentry *entry)
+static void fuse_dentry_tree_del_node(struct dentry *dentry)
{
- return (u64)entry->d_fsdata;
+ struct fuse_dentry *fd = dentry->d_fsdata;
+ struct dentry_bucket *bucket = get_dentry_bucket(dentry);
+
+ spin_lock(&bucket->lock);
+ __fuse_dentry_tree_del_node(fd, bucket);
+ spin_unlock(&bucket->lock);
}
-#else
-union fuse_dentry {
- u64 time;
- struct rcu_head rcu;
-};
+static void fuse_dentry_tree_add_node(struct dentry *dentry)
+{
+ struct fuse_dentry *fd = dentry->d_fsdata;
+ struct dentry_bucket *bucket;
+ struct fuse_dentry *cur;
+ struct rb_node **p, *parent = NULL;
+
+ if (!inval_wq)
+ return;
+
+ bucket = get_dentry_bucket(dentry);
+
+ spin_lock(&bucket->lock);
+
+ __fuse_dentry_tree_del_node(fd, bucket);
+
+ p = &bucket->tree.rb_node;
+ while (*p) {
+ parent = *p;
+ cur = rb_entry(*p, struct fuse_dentry, node);
+ if (fd->time < cur->time)
+ p = &(*p)->rb_left;
+ else
+ p = &(*p)->rb_right;
+ }
+ rb_link_node(&fd->node, parent, p);
+ rb_insert_color(&fd->node, &bucket->tree);
+ spin_unlock(&bucket->lock);
+}
+
+/*
+ * work queue which, when enabled, will periodically check for expired dentries
+ * in the dentries tree.
+ */
+static void fuse_dentry_tree_work(struct work_struct *work)
+{
+ LIST_HEAD(dispose);
+ struct fuse_dentry *fd;
+ struct rb_node *node;
+ int i;
+
+ for (i = 0; i < HASH_SIZE; i++) {
+ spin_lock(&dentry_hash[i].lock);
+ node = rb_first(&dentry_hash[i].tree);
+ while (node) {
+ fd = rb_entry(node, struct fuse_dentry, node);
+ if (time_after64(get_jiffies_64(), fd->time)) {
+ rb_erase(&fd->node, &dentry_hash[i].tree);
+ RB_CLEAR_NODE(&fd->node);
+ spin_unlock(&dentry_hash[i].lock);
+ d_dispose_if_unused(fd->dentry, &dispose);
+ cond_resched();
+ spin_lock(&dentry_hash[i].lock);
+ } else
+ break;
+ node = rb_first(&dentry_hash[i].tree);
+ }
+ spin_unlock(&dentry_hash[i].lock);
+ shrink_dentry_list(&dispose);
+ }
+
+ if (inval_wq)
+ schedule_delayed_work(&dentry_tree_work,
+ secs_to_jiffies(inval_wq));
+}
+
+void fuse_epoch_work(struct work_struct *work)
+{
+ struct fuse_conn *fc = container_of(work, struct fuse_conn,
+ epoch_work);
+ struct fuse_mount *fm;
+ struct inode *inode;
+
+ down_read(&fc->killsb);
+
+ inode = fuse_ilookup(fc, FUSE_ROOT_ID, &fm);
+ if (inode) {
+ iput(inode);
+ /* Remove all possible active references to cached inodes */
+ shrink_dcache_sb(fm->sb);
+ } else
+ pr_warn("Failed to get root inode");
+
+ up_read(&fc->killsb);
+}
+
+void fuse_dentry_tree_init(void)
+{
+ int i;
+
+ for (i = 0; i < HASH_SIZE; i++) {
+ spin_lock_init(&dentry_hash[i].lock);
+ dentry_hash[i].tree = RB_ROOT;
+ }
+ INIT_DELAYED_WORK(&dentry_tree_work, fuse_dentry_tree_work);
+}
+
+void fuse_dentry_tree_cleanup(void)
+{
+ int i;
+
+ inval_wq = 0;
+ cancel_delayed_work_sync(&dentry_tree_work);
+
+ for (i = 0; i < HASH_SIZE; i++)
+ WARN_ON_ONCE(!RB_EMPTY_ROOT(&dentry_hash[i].tree));
+}
static inline void __fuse_dentry_settime(struct dentry *dentry, u64 time)
{
- ((union fuse_dentry *) dentry->d_fsdata)->time = time;
+ ((struct fuse_dentry *) dentry->d_fsdata)->time = time;
}
static inline u64 fuse_dentry_time(const struct dentry *entry)
{
- return ((union fuse_dentry *) entry->d_fsdata)->time;
+ return ((struct fuse_dentry *) entry->d_fsdata)->time;
}
-#endif
static void fuse_dentry_settime(struct dentry *dentry, u64 time)
{
@@ -81,6 +260,7 @@ static void fuse_dentry_settime(struct dentry *dentry, u64 time)
}
__fuse_dentry_settime(dentry, time);
+ fuse_dentry_tree_add_node(dentry);
}
/*
@@ -283,21 +463,36 @@ invalid:
goto out;
}
-#if BITS_PER_LONG < 64
static int fuse_dentry_init(struct dentry *dentry)
{
- dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry),
- GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE);
+ struct fuse_dentry *fd;
+
+ fd = kzalloc(sizeof(struct fuse_dentry),
+ GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE);
+ if (!fd)
+ return -ENOMEM;
+
+ fd->dentry = dentry;
+ RB_CLEAR_NODE(&fd->node);
+ dentry->d_fsdata = fd;
+
+ return 0;
+}
+
+static void fuse_dentry_prune(struct dentry *dentry)
+{
+ struct fuse_dentry *fd = dentry->d_fsdata;
- return dentry->d_fsdata ? 0 : -ENOMEM;
+ if (!RB_EMPTY_NODE(&fd->node))
+ fuse_dentry_tree_del_node(dentry);
}
+
static void fuse_dentry_release(struct dentry *dentry)
{
- union fuse_dentry *fd = dentry->d_fsdata;
+ struct fuse_dentry *fd = dentry->d_fsdata;
kfree_rcu(fd, rcu);
}
-#endif
static int fuse_dentry_delete(const struct dentry *dentry)
{
@@ -331,20 +526,12 @@ static struct vfsmount *fuse_dentry_automount(struct path *path)
const struct dentry_operations fuse_dentry_operations = {
.d_revalidate = fuse_dentry_revalidate,
.d_delete = fuse_dentry_delete,
-#if BITS_PER_LONG < 64
.d_init = fuse_dentry_init,
+ .d_prune = fuse_dentry_prune,
.d_release = fuse_dentry_release,
-#endif
.d_automount = fuse_dentry_automount,
};
-const struct dentry_operations fuse_root_dentry_operations = {
-#if BITS_PER_LONG < 64
- .d_init = fuse_dentry_init,
- .d_release = fuse_dentry_release,
-#endif
-};
-
int fuse_valid_type(int m)
{
return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) ||
@@ -478,7 +665,7 @@ static int get_security_context(struct dentry *entry, umode_t mode,
u32 total_len = sizeof(*header);
int err, nr_ctx = 0;
const char *name = NULL;
- size_t namelen;
+ size_t namesize;
err = security_dentry_init_security(entry, mode, &entry->d_name,
&name, &lsmctx);
@@ -489,12 +676,12 @@ static int get_security_context(struct dentry *entry, umode_t mode,
if (lsmctx.len) {
nr_ctx = 1;
- namelen = strlen(name) + 1;
+ namesize = strlen(name) + 1;
err = -EIO;
- if (WARN_ON(namelen > XATTR_NAME_MAX + 1 ||
+ if (WARN_ON(namesize > XATTR_NAME_MAX + 1 ||
lsmctx.len > S32_MAX))
goto out_err;
- total_len += FUSE_REC_ALIGN(sizeof(*fctx) + namelen +
+ total_len += FUSE_REC_ALIGN(sizeof(*fctx) + namesize +
lsmctx.len);
}
@@ -511,8 +698,8 @@ static int get_security_context(struct dentry *entry, umode_t mode,
fctx->size = lsmctx.len;
ptr += sizeof(*fctx);
- strcpy(ptr, name);
- ptr += namelen;
+ strscpy(ptr, name, namesize);
+ ptr += namesize;
memcpy(ptr, lsmctx.context, lsmctx.len);
}
@@ -746,22 +933,18 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
int err;
struct mnt_idmap *idmap = file_mnt_idmap(file);
struct fuse_conn *fc = get_fuse_conn(dir);
- struct dentry *res = NULL;
if (fuse_is_bad(dir))
return -EIO;
if (d_in_lookup(entry)) {
- res = fuse_lookup(dir, entry, 0);
- if (IS_ERR(res))
- return PTR_ERR(res);
-
- if (res)
- entry = res;
+ struct dentry *res = fuse_lookup(dir, entry, 0);
+ if (res || d_really_is_positive(entry))
+ return finish_no_open(file, res);
}
- if (!(flags & O_CREAT) || d_really_is_positive(entry))
- goto no_open;
+ if (!(flags & O_CREAT))
+ return finish_no_open(file, NULL);
/* Only creates */
file->f_mode |= FMODE_CREATED;
@@ -775,16 +958,13 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
goto mknod;
} else if (err == -EEXIST)
fuse_invalidate_entry(entry);
-out_dput:
- dput(res);
return err;
mknod:
err = fuse_mknod(idmap, dir, entry, mode, 0);
if (err)
- goto out_dput;
-no_open:
- return finish_no_open(file, res);
+ return err;
+ return finish_no_open(file, NULL);
}
/*
@@ -1384,6 +1564,7 @@ retry:
generic_fillattr(idmap, request_mask, inode, stat);
stat->mode = fi->orig_i_mode;
stat->ino = fi->orig_ino;
+ stat->blksize = 1 << fi->cached_i_blkbits;
if (test_bit(FUSE_I_BTIME, &fi->state)) {
stat->btime = fi->i_btime;
stat->result_mask |= STATX_BTIME;
@@ -1410,27 +1591,25 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
if (!parent)
return -ENOENT;
- inode_lock_nested(parent, I_MUTEX_PARENT);
if (!S_ISDIR(parent->i_mode))
- goto unlock;
+ goto put_parent;
err = -ENOENT;
dir = d_find_alias(parent);
if (!dir)
- goto unlock;
+ goto put_parent;
- name->hash = full_name_hash(dir, name->name, name->len);
- entry = d_lookup(dir, name);
+ entry = start_removing_noperm(dir, name);
dput(dir);
- if (!entry)
- goto unlock;
+ if (IS_ERR(entry))
+ goto put_parent;
fuse_dir_changed(parent);
if (!(flags & FUSE_EXPIRE_ONLY))
d_invalidate(entry);
fuse_invalidate_entry_cache(entry);
- if (child_nodeid != 0 && d_really_is_positive(entry)) {
+ if (child_nodeid != 0) {
inode_lock(d_inode(entry));
if (get_node_id(d_inode(entry)) != child_nodeid) {
err = -ENOENT;
@@ -1458,10 +1637,9 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
} else {
err = 0;
}
- dput(entry);
- unlock:
- inode_unlock(parent);
+ end_removing(entry);
+ put_parent:
iput(parent);
return err;
}
@@ -2243,6 +2421,7 @@ static const struct file_operations fuse_dir_operations = {
.fsync = fuse_dir_fsync,
.unlocked_ioctl = fuse_dir_ioctl,
.compat_ioctl = fuse_dir_compat_ioctl,
+ .setlease = simple_nosetlease,
};
static const struct inode_operations fuse_common_inode_operations = {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index f102afc03359..01bc894e9c2b 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -21,6 +21,7 @@
#include <linux/filelock.h>
#include <linux/splice.h>
#include <linux/task_io_accounting_ops.h>
+#include <linux/iomap.h>
static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
unsigned int open_flags, int opcode,
@@ -109,7 +110,9 @@ static void fuse_file_put(struct fuse_file *ff, bool sync)
fuse_file_io_release(ff, ra->inode);
if (!args) {
- /* Do nothing when server does not implement 'open' */
+ /* Do nothing when server does not implement 'opendir' */
+ } else if (args->opcode == FUSE_RELEASE && ff->fm->fc->no_open) {
+ fuse_release_end(ff->fm, args, 0);
} else if (sync) {
fuse_simple_request(ff->fm, args);
fuse_release_end(ff->fm, args, 0);
@@ -130,8 +133,17 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
struct fuse_file *ff;
int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
bool open = isdir ? !fc->no_opendir : !fc->no_open;
+ bool release = !isdir || open;
- ff = fuse_file_alloc(fm, open);
+ /*
+ * ff->args->release_args still needs to be allocated (so we can hold an
+ * inode reference while there are pending inflight file operations when
+ * ->release() is called, see fuse_prepare_release()) even if
+ * fc->no_open is set else it becomes possible for reclaim to deadlock
+ * if while servicing the readahead request the server triggers reclaim
+ * and reclaim evicts the inode of the file being read ahead.
+ */
+ ff = fuse_file_alloc(fm, release);
if (!ff)
return ERR_PTR(-ENOMEM);
@@ -151,13 +163,14 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
fuse_file_free(ff);
return ERR_PTR(err);
} else {
- /* No release needed */
- kfree(ff->args);
- ff->args = NULL;
- if (isdir)
+ if (isdir) {
+ /* No release needed */
+ kfree(ff->args);
+ ff->args = NULL;
fc->no_opendir = 1;
- else
+ } else {
fc->no_open = 1;
+ }
}
}
@@ -355,8 +368,14 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff,
* Make the release synchronous if this is a fuseblk mount,
* synchronous RELEASE is allowed (and desirable) in this case
* because the server can be trusted not to screw up.
+ *
+ * Always use the asynchronous file put because the current thread
+ * might be the fuse server. This can happen if a process starts some
+ * aio and closes the fd before the aio completes. Since aio takes its
+ * own ref to the file, the IO completion has to drop the ref, which is
+ * how the fuse server can end up closing its clients' files.
*/
- fuse_file_put(ff, ff->fm->fc->destroy);
+ fuse_file_put(ff, false);
}
void fuse_release_common(struct file *file, bool isdir)
@@ -788,12 +807,16 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
}
}
-static int fuse_do_readfolio(struct file *file, struct folio *folio)
+static int fuse_do_readfolio(struct file *file, struct folio *folio,
+ size_t off, size_t len)
{
struct inode *inode = folio->mapping->host;
struct fuse_mount *fm = get_fuse_mount(inode);
- loff_t pos = folio_pos(folio);
- struct fuse_folio_desc desc = { .length = folio_size(folio) };
+ loff_t pos = folio_pos(folio) + off;
+ struct fuse_folio_desc desc = {
+ .offset = off,
+ .length = len,
+ };
struct fuse_io_args ia = {
.ap.args.page_zeroing = true,
.ap.args.out_pages = true,
@@ -820,25 +843,155 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio)
if (res < desc.length)
fuse_short_read(inode, attr_ver, res, &ia.ap);
- folio_mark_uptodate(folio);
+ return 0;
+}
+
+static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ unsigned int flags, struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ iomap->type = IOMAP_MAPPED;
+ iomap->length = length;
+ iomap->offset = offset;
+ return 0;
+}
+
+static const struct iomap_ops fuse_iomap_ops = {
+ .iomap_begin = fuse_iomap_begin,
+};
+
+struct fuse_fill_read_data {
+ struct file *file;
+
+ /* Fields below are used if sending the read request asynchronously */
+ struct fuse_conn *fc;
+ struct fuse_io_args *ia;
+ unsigned int nr_bytes;
+};
+
+/* forward declarations */
+static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos,
+ unsigned len, struct fuse_args_pages *ap,
+ unsigned cur_bytes, bool write);
+static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file,
+ unsigned int count, bool async);
+
+static int fuse_handle_readahead(struct folio *folio,
+ struct readahead_control *rac,
+ struct fuse_fill_read_data *data, loff_t pos,
+ size_t len)
+{
+ struct fuse_io_args *ia = data->ia;
+ size_t off = offset_in_folio(folio, pos);
+ struct fuse_conn *fc = data->fc;
+ struct fuse_args_pages *ap;
+ unsigned int nr_pages;
+
+ if (ia && fuse_folios_need_send(fc, pos, len, &ia->ap, data->nr_bytes,
+ false)) {
+ fuse_send_readpages(ia, data->file, data->nr_bytes,
+ fc->async_read);
+ data->nr_bytes = 0;
+ data->ia = NULL;
+ ia = NULL;
+ }
+ if (!ia) {
+ if (fc->num_background >= fc->congestion_threshold &&
+ rac->ra->async_size >= readahead_count(rac))
+ /*
+ * Congested and only async pages left, so skip the
+ * rest.
+ */
+ return -EAGAIN;
+
+ nr_pages = min(fc->max_pages, readahead_count(rac));
+ data->ia = fuse_io_alloc(NULL, nr_pages);
+ if (!data->ia)
+ return -ENOMEM;
+ ia = data->ia;
+ }
+ folio_get(folio);
+ ap = &ia->ap;
+ ap->folios[ap->num_folios] = folio;
+ ap->descs[ap->num_folios].offset = off;
+ ap->descs[ap->num_folios].length = len;
+ data->nr_bytes += len;
+ ap->num_folios++;
return 0;
}
+static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter,
+ struct iomap_read_folio_ctx *ctx,
+ size_t len)
+{
+ struct fuse_fill_read_data *data = ctx->read_ctx;
+ struct folio *folio = ctx->cur_folio;
+ loff_t pos = iter->pos;
+ size_t off = offset_in_folio(folio, pos);
+ struct file *file = data->file;
+ int ret;
+
+ if (ctx->rac) {
+ ret = fuse_handle_readahead(folio, ctx->rac, data, pos, len);
+ } else {
+ /*
+ * for non-readahead read requests, do reads synchronously
+ * since it's not guaranteed that the server can handle
+ * out-of-order reads
+ */
+ ret = fuse_do_readfolio(file, folio, off, len);
+ if (!ret)
+ iomap_finish_folio_read(folio, off, len, ret);
+ }
+ return ret;
+}
+
+static void fuse_iomap_read_submit(struct iomap_read_folio_ctx *ctx)
+{
+ struct fuse_fill_read_data *data = ctx->read_ctx;
+
+ if (data->ia)
+ fuse_send_readpages(data->ia, data->file, data->nr_bytes,
+ data->fc->async_read);
+}
+
+static const struct iomap_read_ops fuse_iomap_read_ops = {
+ .read_folio_range = fuse_iomap_read_folio_range_async,
+ .submit_read = fuse_iomap_read_submit,
+};
+
static int fuse_read_folio(struct file *file, struct folio *folio)
{
struct inode *inode = folio->mapping->host;
- int err;
+ struct fuse_fill_read_data data = {
+ .file = file,
+ };
+ struct iomap_read_folio_ctx ctx = {
+ .cur_folio = folio,
+ .ops = &fuse_iomap_read_ops,
+ .read_ctx = &data,
- err = -EIO;
- if (fuse_is_bad(inode))
- goto out;
+ };
- err = fuse_do_readfolio(file, folio);
+ if (fuse_is_bad(inode)) {
+ folio_unlock(folio);
+ return -EIO;
+ }
+
+ iomap_read_folio(&fuse_iomap_ops, &ctx);
fuse_invalidate_atime(inode);
- out:
- folio_unlock(folio);
- return err;
+ return 0;
+}
+
+static int fuse_iomap_read_folio_range(const struct iomap_iter *iter,
+ struct folio *folio, loff_t pos,
+ size_t len)
+{
+ struct file *file = iter->private;
+ size_t off = offset_in_folio(folio, pos);
+
+ return fuse_do_readfolio(file, folio, off, len);
}
static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
@@ -849,25 +1002,24 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
struct fuse_args_pages *ap = &ia->ap;
size_t count = ia->read.in.size;
size_t num_read = args->out_args[0].size;
- struct address_space *mapping = NULL;
-
- for (i = 0; mapping == NULL && i < ap->num_folios; i++)
- mapping = ap->folios[i]->mapping;
+ struct address_space *mapping;
+ struct inode *inode;
- if (mapping) {
- struct inode *inode = mapping->host;
+ WARN_ON_ONCE(!ap->num_folios);
+ mapping = ap->folios[0]->mapping;
+ inode = mapping->host;
- /*
- * Short read means EOF. If file size is larger, truncate it
- */
- if (!err && num_read < count)
- fuse_short_read(inode, ia->read.attr_ver, num_read, ap);
+ /*
+ * Short read means EOF. If file size is larger, truncate it
+ */
+ if (!err && num_read < count)
+ fuse_short_read(inode, ia->read.attr_ver, num_read, ap);
- fuse_invalidate_atime(inode);
- }
+ fuse_invalidate_atime(inode);
for (i = 0; i < ap->num_folios; i++) {
- folio_end_read(ap->folios[i], !err);
+ iomap_finish_folio_read(ap->folios[i], ap->descs[i].offset,
+ ap->descs[i].length, err);
folio_put(ap->folios[i]);
}
if (ia->ff)
@@ -877,7 +1029,7 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
}
static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file,
- unsigned int count)
+ unsigned int count, bool async)
{
struct fuse_file *ff = file->private_data;
struct fuse_mount *fm = ff->fm;
@@ -899,7 +1051,7 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file,
fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
ia->read.attr_ver = fuse_get_attr_version(fm->fc);
- if (fm->fc->async_read) {
+ if (async) {
ia->ff = fuse_file_get(ff);
ap->args.end = fuse_readpages_end;
err = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
@@ -916,81 +1068,20 @@ static void fuse_readahead(struct readahead_control *rac)
{
struct inode *inode = rac->mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
- unsigned int max_pages, nr_pages;
- struct folio *folio = NULL;
+ struct fuse_fill_read_data data = {
+ .file = rac->file,
+ .fc = fc,
+ };
+ struct iomap_read_folio_ctx ctx = {
+ .ops = &fuse_iomap_read_ops,
+ .rac = rac,
+ .read_ctx = &data
+ };
if (fuse_is_bad(inode))
return;
- max_pages = min_t(unsigned int, fc->max_pages,
- fc->max_read / PAGE_SIZE);
-
- /*
- * This is only accurate the first time through, since readahead_folio()
- * doesn't update readahead_count() from the previous folio until the
- * next call. Grab nr_pages here so we know how many pages we're going
- * to have to process. This means that we will exit here with
- * readahead_count() == folio_nr_pages(last_folio), but we will have
- * consumed all of the folios, and read_pages() will call
- * readahead_folio() again which will clean up the rac.
- */
- nr_pages = readahead_count(rac);
-
- while (nr_pages) {
- struct fuse_io_args *ia;
- struct fuse_args_pages *ap;
- unsigned cur_pages = min(max_pages, nr_pages);
- unsigned int pages = 0;
-
- if (fc->num_background >= fc->congestion_threshold &&
- rac->ra->async_size >= readahead_count(rac))
- /*
- * Congested and only async pages left, so skip the
- * rest.
- */
- break;
-
- ia = fuse_io_alloc(NULL, cur_pages);
- if (!ia)
- break;
- ap = &ia->ap;
-
- while (pages < cur_pages) {
- unsigned int folio_pages;
-
- /*
- * This returns a folio with a ref held on it.
- * The ref needs to be held until the request is
- * completed, since the splice case (see
- * fuse_try_move_page()) drops the ref after it's
- * replaced in the page cache.
- */
- if (!folio)
- folio = __readahead_folio(rac);
-
- folio_pages = folio_nr_pages(folio);
- if (folio_pages > cur_pages - pages) {
- /*
- * Large folios belonging to fuse will never
- * have more pages than max_pages.
- */
- WARN_ON(!pages);
- break;
- }
-
- ap->folios[ap->num_folios] = folio;
- ap->descs[ap->num_folios].length = folio_size(folio);
- ap->num_folios++;
- pages += folio_pages;
- folio = NULL;
- }
- fuse_send_readpages(ia, rac->file, pages << PAGE_SHIFT);
- nr_pages -= pages;
- }
- if (folio) {
- folio_end_read(folio, false);
- folio_put(folio);
- }
+ iomap_readahead(&fuse_iomap_ops, &ctx);
}
static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -1147,7 +1238,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
struct address_space *mapping,
struct iov_iter *ii, loff_t pos,
- unsigned int max_pages)
+ unsigned int max_folios)
{
struct fuse_args_pages *ap = &ia->ap;
struct fuse_conn *fc = get_fuse_conn(mapping->host);
@@ -1157,12 +1248,10 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
int err = 0;
num = min(iov_iter_count(ii), fc->max_write);
- num = min(num, max_pages << PAGE_SHIFT);
ap->args.in_pages = true;
- ap->descs[0].offset = offset;
- while (num) {
+ while (num && ap->num_folios < max_folios) {
size_t tmp;
struct folio *folio;
pgoff_t index = pos >> PAGE_SHIFT;
@@ -1375,6 +1464,10 @@ static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive)
}
}
+static const struct iomap_write_ops fuse_iomap_write_ops = {
+ .read_folio_range = fuse_iomap_read_folio_range,
+};
+
static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
@@ -1384,6 +1477,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = mapping->host;
ssize_t err, count;
struct fuse_conn *fc = get_fuse_conn(inode);
+ bool writeback = false;
if (fc->writeback_cache) {
/* Update size (EOF optimization) and mode (SUID clearing) */
@@ -1392,16 +1486,11 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (err)
return err;
- if (fc->handle_killpriv_v2 &&
- setattr_should_drop_suidgid(idmap,
- file_inode(file))) {
- goto writethrough;
- }
-
- return generic_file_write_iter(iocb, from);
+ if (!fc->handle_killpriv_v2 ||
+ !setattr_should_drop_suidgid(idmap, file_inode(file)))
+ writeback = true;
}
-writethrough:
inode_lock(inode);
err = count = generic_write_checks(iocb, from);
@@ -1420,6 +1509,15 @@ writethrough:
goto out;
written = direct_write_fallback(iocb, from, written,
fuse_perform_write(iocb, from));
+ } else if (writeback) {
+ /*
+ * Use iomap so that we can do granular uptodate reads
+ * and granular dirty tracking for large folios.
+ */
+ written = iomap_file_buffered_write(iocb, from,
+ &fuse_iomap_ops,
+ &fuse_iomap_write_ops,
+ file);
} else {
written = fuse_perform_write(iocb, from);
}
@@ -1566,7 +1664,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
if (!ia)
return -ENOMEM;
- if (fopen_direct_io && fc->direct_io_allow_mmap) {
+ if (fopen_direct_io) {
res = filemap_write_and_wait_range(mapping, pos, pos + count - 1);
if (res) {
fuse_io_free(ia);
@@ -1640,6 +1738,15 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
if (res > 0)
*ppos = pos;
+ if (res > 0 && write && fopen_direct_io) {
+ /*
+ * As in generic_file_direct_write(), invalidate after the
+ * write, to invalidate read-ahead cache that may have competed
+ * with the write.
+ */
+ invalidate_inode_pages2_range(mapping, idx_from, idx_to);
+ }
+
return res > 0 ? res : err;
}
EXPORT_SYMBOL_GPL(fuse_direct_io);
@@ -1785,19 +1892,16 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa)
struct fuse_args_pages *ap = &wpa->ia.ap;
struct inode *inode = wpa->inode;
struct fuse_inode *fi = get_fuse_inode(inode);
- struct backing_dev_info *bdi = inode_to_bdi(inode);
int i;
- for (i = 0; i < ap->num_folios; i++) {
+ for (i = 0; i < ap->num_folios; i++)
/*
* Benchmarks showed that ending writeback within the
* scope of the fi->lock alleviates xarray lock
* contention and noticeably improves performance.
*/
- folio_end_writeback(ap->folios[i]);
- dec_wb_stat(&bdi->wb, WB_WRITEBACK);
- wb_writeout_inc(&bdi->wb);
- }
+ iomap_finish_folio_write(inode, ap->folios[i],
+ ap->descs[i].length);
wake_up(&fi->page_waitq);
}
@@ -1928,17 +2032,6 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
struct fuse_file *ff;
int err;
- /*
- * Inode is always written before the last reference is dropped and
- * hence this should not be reached from reclaim.
- *
- * Writing back the inode from reclaim can deadlock if the request
- * processing itself needs an allocation. Allocations triggering
- * reclaim while serving a request can't be prevented, because it can
- * involve any number of unrelated userspace processes.
- */
- WARN_ON(wbc->for_reclaim);
-
ff = __fuse_write_file_get(fi);
err = fuse_flush_times(inode, ff);
if (ff)
@@ -1981,19 +2074,17 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
}
static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio,
- uint32_t folio_index)
+ uint32_t folio_index, loff_t offset, unsigned len)
{
- struct inode *inode = folio->mapping->host;
struct fuse_args_pages *ap = &wpa->ia.ap;
ap->folios[folio_index] = folio;
- ap->descs[folio_index].offset = 0;
- ap->descs[folio_index].length = folio_size(folio);
-
- inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
+ ap->descs[folio_index].offset = offset;
+ ap->descs[folio_index].length = len;
}
static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio,
+ size_t offset,
struct fuse_file *ff)
{
struct inode *inode = folio->mapping->host;
@@ -2006,7 +2097,7 @@ static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio
return NULL;
fuse_writepage_add_to_bucket(fc, wpa);
- fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio), 0);
+ fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio) + offset, 0);
wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
wpa->inode = inode;
wpa->ia.ff = ff;
@@ -2018,63 +2109,28 @@ static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio
return wpa;
}
-static int fuse_writepage_locked(struct folio *folio)
-{
- struct address_space *mapping = folio->mapping;
- struct inode *inode = mapping->host;
- struct fuse_inode *fi = get_fuse_inode(inode);
- struct fuse_writepage_args *wpa;
- struct fuse_args_pages *ap;
- struct fuse_file *ff;
- int error = -EIO;
-
- ff = fuse_write_file_get(fi);
- if (!ff)
- goto err;
-
- wpa = fuse_writepage_args_setup(folio, ff);
- error = -ENOMEM;
- if (!wpa)
- goto err_writepage_args;
-
- ap = &wpa->ia.ap;
- ap->num_folios = 1;
-
- folio_start_writeback(folio);
- fuse_writepage_args_page_fill(wpa, folio, 0);
-
- spin_lock(&fi->lock);
- list_add_tail(&wpa->queue_entry, &fi->queued_writes);
- fuse_flush_writepages(inode);
- spin_unlock(&fi->lock);
-
- return 0;
-
-err_writepage_args:
- fuse_file_put(ff, false);
-err:
- mapping_set_error(folio->mapping, error);
- return error;
-}
-
struct fuse_fill_wb_data {
struct fuse_writepage_args *wpa;
struct fuse_file *ff;
- struct inode *inode;
unsigned int max_folios;
- unsigned int nr_pages;
+ /*
+ * nr_bytes won't overflow since fuse_folios_need_send() caps
+ * wb requests to never exceed fc->max_pages (which has an upper bound
+ * of U16_MAX).
+ */
+ unsigned int nr_bytes;
};
-static bool fuse_pages_realloc(struct fuse_fill_wb_data *data)
+static bool fuse_pages_realloc(struct fuse_fill_wb_data *data,
+ unsigned int max_pages)
{
struct fuse_args_pages *ap = &data->wpa->ia.ap;
- struct fuse_conn *fc = get_fuse_conn(data->inode);
struct folio **folios;
struct fuse_folio_desc *descs;
unsigned int nfolios = min_t(unsigned int,
max_t(unsigned int, data->max_folios * 2,
FUSE_DEFAULT_MAX_PAGES_PER_REQ),
- fc->max_pages);
+ max_pages);
WARN_ON(nfolios <= data->max_folios);
folios = fuse_folios_alloc(nfolios, GFP_NOFS, &descs);
@@ -2091,10 +2147,10 @@ static bool fuse_pages_realloc(struct fuse_fill_wb_data *data)
return true;
}
-static void fuse_writepages_send(struct fuse_fill_wb_data *data)
+static void fuse_writepages_send(struct inode *inode,
+ struct fuse_fill_wb_data *data)
{
struct fuse_writepage_args *wpa = data->wpa;
- struct inode *inode = data->inode;
struct fuse_inode *fi = get_fuse_inode(inode);
spin_lock(&fi->lock);
@@ -2103,195 +2159,156 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data)
spin_unlock(&fi->lock);
}
-static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio,
- struct fuse_args_pages *ap,
- struct fuse_fill_wb_data *data)
+static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos,
+ unsigned len, struct fuse_args_pages *ap,
+ unsigned cur_bytes, bool write)
{
+ struct folio *prev_folio;
+ struct fuse_folio_desc prev_desc;
+ unsigned bytes = cur_bytes + len;
+ loff_t prev_pos;
+ size_t max_bytes = write ? fc->max_write : fc->max_read;
+
WARN_ON(!ap->num_folios);
/* Reached max pages */
- if (data->nr_pages + folio_nr_pages(folio) > fc->max_pages)
+ if ((bytes + PAGE_SIZE - 1) >> PAGE_SHIFT > fc->max_pages)
return true;
- /* Reached max write bytes */
- if ((data->nr_pages * PAGE_SIZE) + folio_size(folio) > fc->max_write)
+ if (bytes > max_bytes)
return true;
/* Discontinuity */
- if (folio_next_index(ap->folios[ap->num_folios - 1]) != folio->index)
- return true;
-
- /* Need to grow the pages array? If so, did the expansion fail? */
- if (ap->num_folios == data->max_folios && !fuse_pages_realloc(data))
+ prev_folio = ap->folios[ap->num_folios - 1];
+ prev_desc = ap->descs[ap->num_folios - 1];
+ prev_pos = folio_pos(prev_folio) + prev_desc.offset + prev_desc.length;
+ if (prev_pos != pos)
return true;
return false;
}
-static int fuse_writepages_fill(struct folio *folio,
- struct writeback_control *wbc, void *_data)
+static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc,
+ struct folio *folio, u64 pos,
+ unsigned len, u64 end_pos)
{
- struct fuse_fill_wb_data *data = _data;
+ struct fuse_fill_wb_data *data = wpc->wb_ctx;
struct fuse_writepage_args *wpa = data->wpa;
struct fuse_args_pages *ap = &wpa->ia.ap;
- struct inode *inode = data->inode;
+ struct inode *inode = wpc->inode;
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_conn *fc = get_fuse_conn(inode);
- int err;
+ loff_t offset = offset_in_folio(folio, pos);
+
+ WARN_ON_ONCE(!data);
if (!data->ff) {
- err = -EIO;
data->ff = fuse_write_file_get(fi);
if (!data->ff)
- goto out_unlock;
+ return -EIO;
}
- if (wpa && fuse_writepage_need_send(fc, folio, ap, data)) {
- fuse_writepages_send(data);
- data->wpa = NULL;
- data->nr_pages = 0;
+ if (wpa) {
+ bool send = fuse_folios_need_send(fc, pos, len, ap,
+ data->nr_bytes, true);
+
+ if (!send) {
+ /*
+ * Need to grow the pages array? If so, did the
+ * expansion fail?
+ */
+ send = (ap->num_folios == data->max_folios) &&
+ !fuse_pages_realloc(data, fc->max_pages);
+ }
+
+ if (send) {
+ fuse_writepages_send(inode, data);
+ data->wpa = NULL;
+ data->nr_bytes = 0;
+ }
}
if (data->wpa == NULL) {
- err = -ENOMEM;
- wpa = fuse_writepage_args_setup(folio, data->ff);
+ wpa = fuse_writepage_args_setup(folio, offset, data->ff);
if (!wpa)
- goto out_unlock;
+ return -ENOMEM;
fuse_file_get(wpa->ia.ff);
data->max_folios = 1;
ap = &wpa->ia.ap;
}
- folio_start_writeback(folio);
- fuse_writepage_args_page_fill(wpa, folio, ap->num_folios);
- data->nr_pages += folio_nr_pages(folio);
+ fuse_writepage_args_page_fill(wpa, folio, ap->num_folios,
+ offset, len);
+ data->nr_bytes += len;
- err = 0;
ap->num_folios++;
if (!data->wpa)
data->wpa = wpa;
-out_unlock:
- folio_unlock(folio);
- return err;
+ return len;
}
-static int fuse_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+static int fuse_iomap_writeback_submit(struct iomap_writepage_ctx *wpc,
+ int error)
{
- struct inode *inode = mapping->host;
- struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_fill_wb_data data;
- int err;
-
- err = -EIO;
- if (fuse_is_bad(inode))
- goto out;
-
- if (wbc->sync_mode == WB_SYNC_NONE &&
- fc->num_background >= fc->congestion_threshold)
- return 0;
+ struct fuse_fill_wb_data *data = wpc->wb_ctx;
- data.inode = inode;
- data.wpa = NULL;
- data.ff = NULL;
- data.nr_pages = 0;
+ WARN_ON_ONCE(!data);
- err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
- if (data.wpa) {
- WARN_ON(!data.wpa->ia.ap.num_folios);
- fuse_writepages_send(&data);
+ if (data->wpa) {
+ WARN_ON(!data->wpa->ia.ap.num_folios);
+ fuse_writepages_send(wpc->inode, data);
}
- if (data.ff)
- fuse_file_put(data.ff, false);
-out:
- return err;
-}
-
-/*
- * It's worthy to make sure that space is reserved on disk for the write,
- * but how to implement it without killing performance need more thinking.
- */
-static int fuse_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
-{
- pgoff_t index = pos >> PAGE_SHIFT;
- struct fuse_conn *fc = get_fuse_conn(file_inode(file));
- struct folio *folio;
- loff_t fsize;
- int err = -ENOMEM;
+ if (data->ff)
+ fuse_file_put(data->ff, false);
- WARN_ON(!fc->writeback_cache);
-
- folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
- mapping_gfp_mask(mapping));
- if (IS_ERR(folio))
- goto error;
-
- if (folio_test_uptodate(folio) || len >= folio_size(folio))
- goto success;
- /*
- * Check if the start of this folio comes after the end of file,
- * in which case the readpage can be optimized away.
- */
- fsize = i_size_read(mapping->host);
- if (fsize <= folio_pos(folio)) {
- size_t off = offset_in_folio(folio, pos);
- if (off)
- folio_zero_segment(folio, 0, off);
- goto success;
- }
- err = fuse_do_readfolio(file, folio);
- if (err)
- goto cleanup;
-success:
- *foliop = folio;
- return 0;
-
-cleanup:
- folio_unlock(folio);
- folio_put(folio);
-error:
- return err;
+ return error;
}
-static int fuse_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
-{
- struct inode *inode = folio->mapping->host;
-
- /* Haven't copied anything? Skip zeroing, size extending, dirtying. */
- if (!copied)
- goto unlock;
-
- pos += copied;
- if (!folio_test_uptodate(folio)) {
- /* Zero any unwritten bytes at the end of the page */
- size_t endoff = pos & ~PAGE_MASK;
- if (endoff)
- folio_zero_segment(folio, endoff, PAGE_SIZE);
- folio_mark_uptodate(folio);
- }
+static const struct iomap_writeback_ops fuse_writeback_ops = {
+ .writeback_range = fuse_iomap_writeback_range,
+ .writeback_submit = fuse_iomap_writeback_submit,
+};
- if (pos > inode->i_size)
- i_size_write(inode, pos);
+static int fuse_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_fill_wb_data data = {};
+ struct iomap_writepage_ctx wpc = {
+ .inode = inode,
+ .iomap.type = IOMAP_MAPPED,
+ .wbc = wbc,
+ .ops = &fuse_writeback_ops,
+ .wb_ctx = &data,
+ };
- folio_mark_dirty(folio);
+ if (fuse_is_bad(inode))
+ return -EIO;
-unlock:
- folio_unlock(folio);
- folio_put(folio);
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ fc->num_background >= fc->congestion_threshold)
+ return 0;
- return copied;
+ return iomap_writepages(&wpc);
}
static int fuse_launder_folio(struct folio *folio)
{
int err = 0;
+ struct fuse_fill_wb_data data = {};
+ struct iomap_writepage_ctx wpc = {
+ .inode = folio->mapping->host,
+ .iomap.type = IOMAP_MAPPED,
+ .ops = &fuse_writeback_ops,
+ .wb_ctx = &data,
+ };
+
if (folio_clear_dirty_for_io(folio)) {
- err = fuse_writepage_locked(folio);
+ err = iomap_writeback_folio(&wpc, folio);
+ err = fuse_iomap_writeback_submit(&wpc, err);
if (!err)
folio_wait_writeback(folio);
}
@@ -3018,6 +3035,8 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
.flags = flags
};
struct fuse_write_out outarg;
+ struct fuse_copy_file_range_out outarg_64;
+ u64 bytes_copied;
ssize_t err;
/* mark unstable when write-back is not used, and file_out gets
* extended */
@@ -3067,30 +3086,51 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
if (is_unstable)
set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
- args.opcode = FUSE_COPY_FILE_RANGE;
+ args.opcode = FUSE_COPY_FILE_RANGE_64;
args.nodeid = ff_in->nodeid;
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
args.in_args[0].value = &inarg;
args.out_numargs = 1;
- args.out_args[0].size = sizeof(outarg);
- args.out_args[0].value = &outarg;
+ args.out_args[0].size = sizeof(outarg_64);
+ args.out_args[0].value = &outarg_64;
+ if (fc->no_copy_file_range_64) {
+fallback:
+ /* Fall back to old op that can't handle large copy length */
+ args.opcode = FUSE_COPY_FILE_RANGE;
+ args.out_args[0].size = sizeof(outarg);
+ args.out_args[0].value = &outarg;
+ inarg.len = len = min_t(size_t, len, UINT_MAX & PAGE_MASK);
+ }
err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
- fc->no_copy_file_range = 1;
- err = -EOPNOTSUPP;
+ if (fc->no_copy_file_range_64) {
+ fc->no_copy_file_range = 1;
+ err = -EOPNOTSUPP;
+ } else {
+ fc->no_copy_file_range_64 = 1;
+ goto fallback;
+ }
}
if (err)
goto out;
+ bytes_copied = fc->no_copy_file_range_64 ?
+ outarg.size : outarg_64.bytes_copied;
+
+ if (bytes_copied > len) {
+ err = -EIO;
+ goto out;
+ }
+
truncate_inode_pages_range(inode_out->i_mapping,
ALIGN_DOWN(pos_out, PAGE_SIZE),
- ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1);
+ ALIGN(pos_out + bytes_copied, PAGE_SIZE) - 1);
file_update_time(file_out);
- fuse_write_update_attr(inode_out, pos_out + outarg.size, outarg.size);
+ fuse_write_update_attr(inode_out, pos_out + bytes_copied, bytes_copied);
- err = outarg.size;
+ err = bytes_copied;
out:
if (is_unstable)
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
@@ -3144,12 +3184,13 @@ static const struct address_space_operations fuse_file_aops = {
.readahead = fuse_readahead,
.writepages = fuse_writepages,
.launder_folio = fuse_launder_folio,
- .dirty_folio = filemap_dirty_folio,
+ .dirty_folio = iomap_dirty_folio,
+ .release_folio = iomap_release_folio,
+ .invalidate_folio = iomap_invalidate_folio,
+ .is_partially_uptodate = iomap_is_partially_uptodate,
.migrate_folio = filemap_migrate_folio,
.bmap = fuse_bmap,
.direct_IO = fuse_direct_IO,
- .write_begin = fuse_write_begin,
- .write_end = fuse_write_end,
};
void fuse_init_file_inode(struct inode *inode, unsigned int flags)
diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h
index 5a9bd771a319..134bf44aff0d 100644
--- a/fs/fuse/fuse_dev_i.h
+++ b/fs/fuse/fuse_dev_i.h
@@ -12,6 +12,8 @@
#define FUSE_INT_REQ_BIT (1ULL << 0)
#define FUSE_REQ_ID_STEP (1ULL << 1)
+extern struct wait_queue_head fuse_dev_waitq;
+
struct fuse_arg;
struct fuse_args;
struct fuse_pqueue;
@@ -37,15 +39,22 @@ struct fuse_copy_state {
} ring;
};
-static inline struct fuse_dev *fuse_get_dev(struct file *file)
+#define FUSE_DEV_SYNC_INIT ((struct fuse_dev *) 1)
+#define FUSE_DEV_PTR_MASK (~1UL)
+
+static inline struct fuse_dev *__fuse_get_dev(struct file *file)
{
/*
* Lockless access is OK, because file->private data is set
* once during mount and is valid until the file is released.
*/
- return READ_ONCE(file->private_data);
+ struct fuse_dev *fud = READ_ONCE(file->private_data);
+
+ return (typeof(fud)) ((unsigned long) fud & FUSE_DEV_PTR_MASK);
}
+struct fuse_dev *fuse_get_dev(struct file *file);
+
unsigned int fuse_req_hash(u64 unique);
struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique);
@@ -53,6 +62,7 @@ void fuse_dev_end_requests(struct list_head *head);
void fuse_copy_init(struct fuse_copy_state *cs, bool write,
struct iov_iter *iter);
+void fuse_copy_finish(struct fuse_copy_state *cs);
int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs,
unsigned int argpages, struct fuse_arg *args,
int zeroing);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index b54f4f57789f..7f16049387d1 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -54,6 +54,13 @@
/** Frequency (in jiffies) of request timeout checks, if opted into */
extern const unsigned long fuse_timeout_timer_freq;
+/*
+ * Dentries invalidation workqueue period, in seconds. The value of this
+ * parameter shall be >= FUSE_DENTRY_INVAL_FREQ_MIN seconds, or 0 (zero), in
+ * which case no workqueue will be created.
+ */
+extern unsigned inval_wq __read_mostly;
+
/** Maximum of max_pages received in init_out */
extern unsigned int fuse_max_pages_limit;
/*
@@ -210,6 +217,12 @@ struct fuse_inode {
/** Reference to backing file in passthrough mode */
struct fuse_backing *fb;
#endif
+
+ /*
+ * The underlying inode->i_blkbits value will not be modified,
+ * so preserve the blocksize specified by the server.
+ */
+ u8 cached_i_blkbits;
};
/** FUSE inode state bits */
@@ -226,6 +239,11 @@ enum {
FUSE_I_BTIME,
/* Wants or already has page cache IO */
FUSE_I_CACHE_IO_MODE,
+ /*
+ * Client has exclusive access to the inode, either because fs is local
+ * or the fuse server has an exclusive "lease" on distributed fs
+ */
+ FUSE_I_EXCLUSIVE,
};
struct fuse_conn;
@@ -636,6 +654,8 @@ struct fuse_conn {
/** Current epoch for up-to-date dentries */
atomic_t epoch;
+ struct work_struct epoch_work;
+
struct rcu_head rcu;
/** The user id for this mount */
@@ -850,6 +870,9 @@ struct fuse_conn {
/** Does the filesystem support copy_file_range? */
unsigned no_copy_file_range:1;
+ /** Does the filesystem support copy_file_range_64? */
+ unsigned no_copy_file_range_64:1;
+
/* Send DESTROY request */
unsigned int destroy:1;
@@ -895,6 +918,9 @@ struct fuse_conn {
/* Is link not implemented by fs? */
unsigned int no_link:1;
+ /* Is synchronous FUSE_INIT allowed? */
+ unsigned int sync_init:1;
+
/* Use io_uring for communication */
unsigned int io_uring;
@@ -913,12 +939,6 @@ struct fuse_conn {
/** Device ID from the root super block */
dev_t dev;
- /** Dentries in the control filesystem */
- struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES];
-
- /** number of dentries used in the above array */
- int ctl_ndents;
-
/** Key for lock owner ID scrambling */
u32 scramble_key[4];
@@ -1032,7 +1052,7 @@ static inline struct fuse_conn *get_fuse_conn(struct inode *inode)
return get_fuse_mount_super(inode->i_sb)->fc;
}
-static inline struct fuse_inode *get_fuse_inode(struct inode *inode)
+static inline struct fuse_inode *get_fuse_inode(const struct inode *inode)
{
return container_of(inode, struct fuse_inode, inode);
}
@@ -1074,6 +1094,13 @@ static inline bool fuse_is_bad(struct inode *inode)
return unlikely(test_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state));
}
+static inline bool fuse_inode_is_exclusive(const struct inode *inode)
+{
+ const struct fuse_inode *fi = get_fuse_inode(inode);
+
+ return test_bit(FUSE_I_EXCLUSIVE, &fi->state);
+}
+
static inline struct folio **fuse_folios_alloc(unsigned int nfolios, gfp_t flags,
struct fuse_folio_desc **desc)
{
@@ -1109,7 +1136,6 @@ static inline void fuse_sync_bucket_dec(struct fuse_sync_bucket *bucket)
extern const struct file_operations fuse_dev_operations;
extern const struct dentry_operations fuse_dentry_operations;
-extern const struct dentry_operations fuse_root_dentry_operations;
/**
* Get a filled in inode
@@ -1248,6 +1274,11 @@ int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args,
gfp_t gfp_flags);
/**
+ * Assign a unique id to a fuse request
+ */
+void fuse_request_assign_unique(struct fuse_iqueue *fiq, struct fuse_req *req);
+
+/**
* End a finished request
*/
void fuse_request_end(struct fuse_req *req);
@@ -1259,6 +1290,11 @@ void fuse_wait_aborted(struct fuse_conn *fc);
/* Check if any requests timed out */
void fuse_check_timeout(struct work_struct *work);
+void fuse_dentry_tree_init(void);
+void fuse_dentry_tree_cleanup(void);
+
+void fuse_epoch_work(struct work_struct *work);
+
/**
* Invalidate inode attributes
*/
@@ -1308,7 +1344,7 @@ struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc);
struct fuse_dev *fuse_dev_alloc(void);
void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc);
void fuse_dev_free(struct fuse_dev *fud);
-void fuse_send_init(struct fuse_mount *fm);
+int fuse_send_init(struct fuse_mount *fm);
/**
* Fill in superblock and initialize fuse connection
@@ -1400,6 +1436,12 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
u64 child_nodeid, struct qstr *name, u32 flags);
+/*
+ * Try to prune this inode. If neither the inode itself nor dentries associated
+ * with this inode have any external reference, then the inode can be freed.
+ */
+void fuse_try_prune_one_inode(struct fuse_conn *fc, u64 nodeid);
+
int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
bool isdir);
@@ -1486,9 +1528,9 @@ void fuse_dax_cancel_work(struct fuse_conn *fc);
long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
unsigned long arg);
-int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int fuse_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
int fuse_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
/* iomode.c */
int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff);
@@ -1505,29 +1547,11 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
void fuse_file_release(struct inode *inode, struct fuse_file *ff,
unsigned int open_flags, fl_owner_t id, bool isdir);
-/* passthrough.c */
-static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi)
-{
-#ifdef CONFIG_FUSE_PASSTHROUGH
- return READ_ONCE(fi->fb);
-#else
- return NULL;
-#endif
-}
-
-static inline struct fuse_backing *fuse_inode_backing_set(struct fuse_inode *fi,
- struct fuse_backing *fb)
-{
-#ifdef CONFIG_FUSE_PASSTHROUGH
- return xchg(&fi->fb, fb);
-#else
- return NULL;
-#endif
-}
-
+/* backing.c */
#ifdef CONFIG_FUSE_PASSTHROUGH
struct fuse_backing *fuse_backing_get(struct fuse_backing *fb);
void fuse_backing_put(struct fuse_backing *fb);
+struct fuse_backing *fuse_backing_lookup(struct fuse_conn *fc, int backing_id);
#else
static inline struct fuse_backing *fuse_backing_get(struct fuse_backing *fb)
@@ -1538,6 +1562,11 @@ static inline struct fuse_backing *fuse_backing_get(struct fuse_backing *fb)
static inline void fuse_backing_put(struct fuse_backing *fb)
{
}
+static inline struct fuse_backing *fuse_backing_lookup(struct fuse_conn *fc,
+ int backing_id)
+{
+ return NULL;
+}
#endif
void fuse_backing_files_init(struct fuse_conn *fc);
@@ -1545,9 +1574,27 @@ void fuse_backing_files_free(struct fuse_conn *fc);
int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map);
int fuse_backing_close(struct fuse_conn *fc, int backing_id);
-struct fuse_backing *fuse_passthrough_open(struct file *file,
- struct inode *inode,
- int backing_id);
+/* passthrough.c */
+static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi)
+{
+#ifdef CONFIG_FUSE_PASSTHROUGH
+ return READ_ONCE(fi->fb);
+#else
+ return NULL;
+#endif
+}
+
+static inline struct fuse_backing *fuse_inode_backing_set(struct fuse_inode *fi,
+ struct fuse_backing *fb)
+{
+#ifdef CONFIG_FUSE_PASSTHROUGH
+ return xchg(&fi->fb, fb);
+#else
+ return NULL;
+#endif
+}
+
+struct fuse_backing *fuse_passthrough_open(struct file *file, int backing_id);
void fuse_passthrough_release(struct fuse_file *ff, struct fuse_backing *fb);
static inline struct file *fuse_file_passthrough(struct fuse_file *ff)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index bfe8d8af46f3..819e50d66622 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -7,8 +7,10 @@
*/
#include "fuse_i.h"
+#include "fuse_dev_i.h"
#include "dev_uring_i.h"
+#include <linux/dax.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/file.h>
@@ -33,6 +35,7 @@ MODULE_LICENSE("GPL");
static struct kmem_cache *fuse_inode_cachep;
struct list_head fuse_conn_list;
DEFINE_MUTEX(fuse_mutex);
+DECLARE_WAIT_QUEUE_HEAD(fuse_dev_waitq);
static int set_global_limit(const char *val, const struct kernel_param *kp);
@@ -100,14 +103,11 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
if (!fi)
return NULL;
- fi->i_time = 0;
+ /* Initialize private data (i.e. everything except fi->inode) */
+ BUILD_BUG_ON(offsetof(struct fuse_inode, inode) != 0);
+ memset((void *) fi + sizeof(fi->inode), 0, sizeof(*fi) - sizeof(fi->inode));
+
fi->inval_mask = ~0;
- fi->nodeid = 0;
- fi->nlookup = 0;
- fi->attr_version = 0;
- fi->orig_ino = 0;
- fi->state = 0;
- fi->submount_lookup = NULL;
mutex_init(&fi->mutex);
spin_lock_init(&fi->lock);
fi->forget = fuse_alloc_forget();
@@ -160,7 +160,10 @@ static void fuse_evict_inode(struct inode *inode)
struct fuse_inode *fi = get_fuse_inode(inode);
/* Will write inode on close/munmap and in all other dirtiers */
- WARN_ON(inode->i_state & I_DIRTY_INODE);
+ WARN_ON(inode_state_read_once(inode) & I_DIRTY_INODE);
+
+ if (FUSE_IS_DAX(inode))
+ dax_break_layout_final(inode);
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
@@ -285,10 +288,10 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
}
}
- if (attr->blksize != 0)
- inode->i_blkbits = ilog2(attr->blksize);
+ if (attr->blksize)
+ fi->cached_i_blkbits = ilog2(attr->blksize);
else
- inode->i_blkbits = inode->i_sb->s_blocksize_bits;
+ fi->cached_i_blkbits = inode->i_sb->s_blocksize_bits;
/*
* Don't set the sticky bit in i_mode, unless we want the VFS
@@ -502,7 +505,7 @@ retry:
if (!inode)
return NULL;
- if ((inode->i_state & I_NEW)) {
+ if ((inode_state_read_once(inode) & I_NEW)) {
inode->i_flags |= S_NOATIME;
if (!fc->writeback_cache || !S_ISREG(attr->mode))
inode->i_flags |= S_NOCMTIME;
@@ -582,6 +585,17 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
return 0;
}
+void fuse_try_prune_one_inode(struct fuse_conn *fc, u64 nodeid)
+{
+ struct inode *inode;
+
+ inode = fuse_ilookup(fc, nodeid, NULL);
+ if (!inode)
+ return;
+ d_prune_aliases(inode);
+ iput(inode);
+}
+
bool fuse_lock_inode(struct inode *inode)
{
bool locked = false;
@@ -963,6 +977,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
refcount_set(&fc->count, 1);
atomic_set(&fc->dev_count, 1);
atomic_set(&fc->epoch, 1);
+ INIT_WORK(&fc->epoch_work, fuse_epoch_work);
init_waitqueue_head(&fc->blocked_waitq);
fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv);
INIT_LIST_HEAD(&fc->bg_queue);
@@ -1007,26 +1022,28 @@ static void delayed_release(struct rcu_head *p)
void fuse_conn_put(struct fuse_conn *fc)
{
- if (refcount_dec_and_test(&fc->count)) {
- struct fuse_iqueue *fiq = &fc->iq;
- struct fuse_sync_bucket *bucket;
-
- if (IS_ENABLED(CONFIG_FUSE_DAX))
- fuse_dax_conn_free(fc);
- if (fc->timeout.req_timeout)
- cancel_delayed_work_sync(&fc->timeout.work);
- if (fiq->ops->release)
- fiq->ops->release(fiq);
- put_pid_ns(fc->pid_ns);
- bucket = rcu_dereference_protected(fc->curr_bucket, 1);
- if (bucket) {
- WARN_ON(atomic_read(&bucket->count) != 1);
- kfree(bucket);
- }
- if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
- fuse_backing_files_free(fc);
- call_rcu(&fc->rcu, delayed_release);
+ struct fuse_iqueue *fiq = &fc->iq;
+ struct fuse_sync_bucket *bucket;
+
+ if (!refcount_dec_and_test(&fc->count))
+ return;
+
+ if (IS_ENABLED(CONFIG_FUSE_DAX))
+ fuse_dax_conn_free(fc);
+ if (fc->timeout.req_timeout)
+ cancel_delayed_work_sync(&fc->timeout.work);
+ cancel_work_sync(&fc->epoch_work);
+ if (fiq->ops->release)
+ fiq->ops->release(fiq);
+ put_pid_ns(fc->pid_ns);
+ bucket = rcu_dereference_protected(fc->curr_bucket, 1);
+ if (bucket) {
+ WARN_ON(atomic_read(&bucket->count) != 1);
+ kfree(bucket);
}
+ if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
+ fuse_backing_files_free(fc);
+ call_rcu(&fc->rcu, delayed_release);
}
EXPORT_SYMBOL_GPL(fuse_conn_put);
@@ -1205,7 +1222,7 @@ static const struct super_operations fuse_super_operations = {
.free_inode = fuse_free_inode,
.evict_inode = fuse_evict_inode,
.write_inode = fuse_write_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.umount_begin = fuse_umount_begin,
.statfs = fuse_statfs,
.sync_fs = fuse_sync_fs,
@@ -1269,7 +1286,7 @@ static void set_request_timeout(struct fuse_conn *fc, unsigned int timeout)
{
fc->timeout.req_timeout = secs_to_jiffies(timeout);
INIT_DELAYED_WORK(&fc->timeout.work, fuse_check_timeout);
- queue_delayed_work(system_wq, &fc->timeout.work,
+ queue_delayed_work(system_percpu_wq, &fc->timeout.work,
fuse_timeout_timer_freq);
}
@@ -1465,7 +1482,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
wake_up_all(&fc->blocked_waitq);
}
-void fuse_send_init(struct fuse_mount *fm)
+static struct fuse_init_args *fuse_new_init(struct fuse_mount *fm)
{
struct fuse_init_args *ia;
u64 flags;
@@ -1524,10 +1541,30 @@ void fuse_send_init(struct fuse_mount *fm)
ia->args.out_args[0].value = &ia->out;
ia->args.force = true;
ia->args.nocreds = true;
- ia->args.end = process_init_reply;
- if (fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0)
- process_init_reply(fm, &ia->args, -ENOTCONN);
+ return ia;
+}
+
+int fuse_send_init(struct fuse_mount *fm)
+{
+ struct fuse_init_args *ia = fuse_new_init(fm);
+ int err;
+
+ if (fm->fc->sync_init) {
+ err = fuse_simple_request(fm, &ia->args);
+ /* Ignore size of init reply */
+ if (err > 0)
+ err = 0;
+ } else {
+ ia->args.end = process_init_reply;
+ err = fuse_simple_background(fm, &ia->args, GFP_KERNEL);
+ if (!err)
+ return 0;
+ }
+ process_init_reply(fm, &ia->args, err);
+ if (fm->fc->conn_error)
+ return -ENOTCONN;
+ return 0;
}
EXPORT_SYMBOL_GPL(fuse_send_init);
@@ -1557,8 +1594,6 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
if (err)
return err;
- /* fuse does it's own writeback accounting */
- sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT;
sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT;
/*
@@ -1715,7 +1750,7 @@ static int fuse_fill_super_submount(struct super_block *sb,
fi = get_fuse_inode(root);
fi->nlookup--;
- sb->s_d_op = &fuse_dentry_operations;
+ set_default_d_op(sb, &fuse_dentry_operations);
sb->s_root = d_make_root(root);
if (!sb->s_root)
return -ENOMEM;
@@ -1807,6 +1842,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
if (!sb_set_blocksize(sb, ctx->blksize))
goto err;
#endif
+ fc->sync_fs = 1;
} else {
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
@@ -1850,17 +1886,19 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
err = -ENOMEM;
root = fuse_get_root_inode(sb, ctx->rootmode);
- sb->s_d_op = &fuse_root_dentry_operations;
+ set_default_d_op(sb, &fuse_dentry_operations);
root_dentry = d_make_root(root);
if (!root_dentry)
goto err_dev_free;
- /* Root dentry doesn't have .d_revalidate */
- sb->s_d_op = &fuse_dentry_operations;
mutex_lock(&fuse_mutex);
err = -EINVAL;
- if (ctx->fudptr && *ctx->fudptr)
- goto err_unlock;
+ if (ctx->fudptr && *ctx->fudptr) {
+ if (*ctx->fudptr == FUSE_DEV_SYNC_INIT)
+ fc->sync_init = 1;
+ else
+ goto err_unlock;
+ }
err = fuse_ctl_add_conn(fc);
if (err)
@@ -1868,8 +1906,10 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
list_add_tail(&fc->entry, &fuse_conn_list);
sb->s_root = root_dentry;
- if (ctx->fudptr)
+ if (ctx->fudptr) {
*ctx->fudptr = fud;
+ wake_up_all(&fuse_dev_waitq);
+ }
mutex_unlock(&fuse_mutex);
return 0;
@@ -1890,6 +1930,7 @@ EXPORT_SYMBOL_GPL(fuse_fill_super_common);
static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
{
struct fuse_fs_context *ctx = fsc->fs_private;
+ struct fuse_mount *fm;
int err;
if (!ctx->file || !ctx->rootmode_present ||
@@ -1910,8 +1951,10 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
return err;
/* file->private_data shall be visible on all CPUs after this */
smp_mb();
- fuse_send_init(get_fuse_mount_super(sb));
- return 0;
+
+ fm = get_fuse_mount_super(sb);
+
+ return fuse_send_init(fm);
}
/*
@@ -1972,7 +2015,7 @@ static int fuse_get_tree(struct fs_context *fsc)
* Allow creating a fuse mount with an already initialized fuse
* connection
*/
- fud = READ_ONCE(ctx->file->private_data);
+ fud = __fuse_get_dev(ctx->file);
if (ctx->file->f_op == &fuse_dev_operations && fud) {
fsc->sget_key = fud->fc;
sb = sget_fc(fsc, fuse_test_super, fuse_set_no_super);
@@ -2243,6 +2286,8 @@ static int __init fuse_init(void)
if (res)
goto err_sysfs_cleanup;
+ fuse_dentry_tree_init();
+
sanitize_global_limit(&max_user_bgreq);
sanitize_global_limit(&max_user_congthresh);
@@ -2262,6 +2307,7 @@ static void __exit fuse_exit(void)
{
pr_debug("exit\n");
+ fuse_dentry_tree_cleanup();
fuse_ctl_cleanup();
fuse_sysfs_cleanup();
fuse_fs_cleanup();
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
index 2d9abf48828f..fdc175e93f74 100644
--- a/fs/fuse/ioctl.c
+++ b/fs/fuse/ioctl.c
@@ -502,7 +502,7 @@ static void fuse_priv_ioctl_cleanup(struct inode *inode, struct fuse_file *ff)
fuse_file_release(inode, ff, O_RDONLY, NULL, S_ISDIR(inode->i_mode));
}
-int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int fuse_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct fuse_file *ff;
@@ -540,7 +540,7 @@ cleanup:
}
int fuse_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct fuse_file *ff;
diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c
index c99e285f3183..3728933188f3 100644
--- a/fs/fuse/iomode.c
+++ b/fs/fuse/iomode.c
@@ -177,8 +177,7 @@ static int fuse_file_passthrough_open(struct inode *inode, struct file *file)
(ff->open_flags & ~FOPEN_PASSTHROUGH_MASK))
return -EINVAL;
- fb = fuse_passthrough_open(file, inode,
- ff->args->open_outarg.backing_id);
+ fb = fuse_passthrough_open(file, ff->args->open_outarg.backing_id);
if (IS_ERR(fb))
return PTR_ERR(fb);
diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
index 607ef735ad4a..72de97c03d0e 100644
--- a/fs/fuse/passthrough.c
+++ b/fs/fuse/passthrough.c
@@ -144,166 +144,12 @@ ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma)
return backing_file_mmap(backing_file, vma, &ctx);
}
-struct fuse_backing *fuse_backing_get(struct fuse_backing *fb)
-{
- if (fb && refcount_inc_not_zero(&fb->count))
- return fb;
- return NULL;
-}
-
-static void fuse_backing_free(struct fuse_backing *fb)
-{
- pr_debug("%s: fb=0x%p\n", __func__, fb);
-
- if (fb->file)
- fput(fb->file);
- put_cred(fb->cred);
- kfree_rcu(fb, rcu);
-}
-
-void fuse_backing_put(struct fuse_backing *fb)
-{
- if (fb && refcount_dec_and_test(&fb->count))
- fuse_backing_free(fb);
-}
-
-void fuse_backing_files_init(struct fuse_conn *fc)
-{
- idr_init(&fc->backing_files_map);
-}
-
-static int fuse_backing_id_alloc(struct fuse_conn *fc, struct fuse_backing *fb)
-{
- int id;
-
- idr_preload(GFP_KERNEL);
- spin_lock(&fc->lock);
- /* FIXME: xarray might be space inefficient */
- id = idr_alloc_cyclic(&fc->backing_files_map, fb, 1, 0, GFP_ATOMIC);
- spin_unlock(&fc->lock);
- idr_preload_end();
-
- WARN_ON_ONCE(id == 0);
- return id;
-}
-
-static struct fuse_backing *fuse_backing_id_remove(struct fuse_conn *fc,
- int id)
-{
- struct fuse_backing *fb;
-
- spin_lock(&fc->lock);
- fb = idr_remove(&fc->backing_files_map, id);
- spin_unlock(&fc->lock);
-
- return fb;
-}
-
-static int fuse_backing_id_free(int id, void *p, void *data)
-{
- struct fuse_backing *fb = p;
-
- WARN_ON_ONCE(refcount_read(&fb->count) != 1);
- fuse_backing_free(fb);
- return 0;
-}
-
-void fuse_backing_files_free(struct fuse_conn *fc)
-{
- idr_for_each(&fc->backing_files_map, fuse_backing_id_free, NULL);
- idr_destroy(&fc->backing_files_map);
-}
-
-int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map)
-{
- struct file *file;
- struct super_block *backing_sb;
- struct fuse_backing *fb = NULL;
- int res;
-
- pr_debug("%s: fd=%d flags=0x%x\n", __func__, map->fd, map->flags);
-
- /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */
- res = -EPERM;
- if (!fc->passthrough || !capable(CAP_SYS_ADMIN))
- goto out;
-
- res = -EINVAL;
- if (map->flags || map->padding)
- goto out;
-
- file = fget_raw(map->fd);
- res = -EBADF;
- if (!file)
- goto out;
-
- backing_sb = file_inode(file)->i_sb;
- res = -ELOOP;
- if (backing_sb->s_stack_depth >= fc->max_stack_depth)
- goto out_fput;
-
- fb = kmalloc(sizeof(struct fuse_backing), GFP_KERNEL);
- res = -ENOMEM;
- if (!fb)
- goto out_fput;
-
- fb->file = file;
- fb->cred = prepare_creds();
- refcount_set(&fb->count, 1);
-
- res = fuse_backing_id_alloc(fc, fb);
- if (res < 0) {
- fuse_backing_free(fb);
- fb = NULL;
- }
-
-out:
- pr_debug("%s: fb=0x%p, ret=%i\n", __func__, fb, res);
-
- return res;
-
-out_fput:
- fput(file);
- goto out;
-}
-
-int fuse_backing_close(struct fuse_conn *fc, int backing_id)
-{
- struct fuse_backing *fb = NULL;
- int err;
-
- pr_debug("%s: backing_id=%d\n", __func__, backing_id);
-
- /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */
- err = -EPERM;
- if (!fc->passthrough || !capable(CAP_SYS_ADMIN))
- goto out;
-
- err = -EINVAL;
- if (backing_id <= 0)
- goto out;
-
- err = -ENOENT;
- fb = fuse_backing_id_remove(fc, backing_id);
- if (!fb)
- goto out;
-
- fuse_backing_put(fb);
- err = 0;
-out:
- pr_debug("%s: fb=0x%p, err=%i\n", __func__, fb, err);
-
- return err;
-}
-
/*
* Setup passthrough to a backing file.
*
* Returns an fb object with elevated refcount to be stored in fuse inode.
*/
-struct fuse_backing *fuse_passthrough_open(struct file *file,
- struct inode *inode,
- int backing_id)
+struct fuse_backing *fuse_passthrough_open(struct file *file, int backing_id)
{
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fm->fc;
@@ -315,12 +161,8 @@ struct fuse_backing *fuse_passthrough_open(struct file *file,
if (backing_id <= 0)
goto out;
- rcu_read_lock();
- fb = idr_find(&fc->backing_files_map, backing_id);
- fb = fuse_backing_get(fb);
- rcu_read_unlock();
-
err = -ENOENT;
+ fb = fuse_backing_lookup(fc, backing_id);
if (!fb)
goto out;
diff --git a/fs/fuse/trace.c b/fs/fuse/trace.c
new file mode 100644
index 000000000000..93bd72efc98c
--- /dev/null
+++ b/fs/fuse/trace.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "dev_uring_i.h"
+#include "fuse_i.h"
+#include "fuse_dev_i.h"
+
+#include <linux/pagemap.h>
+
+#define CREATE_TRACE_POINTS
+#include "fuse_trace.h"
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 53c2626e90e7..b2f6486fe1d5 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -9,7 +9,6 @@
#include <linux/pci.h>
#include <linux/interrupt.h>
#include <linux/group_cpus.h>
-#include <linux/pfn_t.h>
#include <linux/memremap.h>
#include <linux/module.h>
#include <linux/virtio.h>
@@ -21,6 +20,7 @@
#include <linux/cleanup.h>
#include <linux/uio.h>
#include "fuse_i.h"
+#include "fuse_dev_i.h"
/* Used to help calculate the FUSE connection's max_pages limit for a request's
* size. Parts of the struct fuse_req are sliced into scattergather lists in
@@ -373,7 +373,7 @@ static int virtio_fs_add_queues_sysfs(struct virtio_fs *fs)
sprintf(buff, "%d", i);
fsvq->kobj = kobject_create_and_add(buff, fs->mqs_kobj);
- if (!fs->mqs_kobj) {
+ if (!fsvq->kobj) {
ret = -ENOMEM;
goto out_del;
}
@@ -762,7 +762,6 @@ static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req)
static void virtio_fs_request_complete(struct fuse_req *req,
struct virtio_fs_vq *fsvq)
{
- struct fuse_pqueue *fpq = &fsvq->fud->pq;
struct fuse_args *args;
struct fuse_args_pages *ap;
unsigned int len, i, thislen;
@@ -791,9 +790,7 @@ static void virtio_fs_request_complete(struct fuse_req *req,
}
}
- spin_lock(&fpq->lock);
clear_bit(FR_SENT, &req->flags);
- spin_unlock(&fpq->lock);
fuse_request_end(req);
spin_lock(&fsvq->lock);
@@ -862,7 +859,7 @@ static void virtio_fs_requests_done_work(struct work_struct *work)
static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *fs)
{
const struct cpumask *mask, *masks;
- unsigned int q, cpu;
+ unsigned int q, cpu, nr_masks;
/* First attempt to map using existing transport layer affinities
* e.g. PCIe MSI-X
@@ -882,7 +879,7 @@ static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *f
return;
fallback:
/* Attempt to map evenly in groups over the CPUs */
- masks = group_cpus_evenly(fs->num_request_queues);
+ masks = group_cpus_evenly(fs->num_request_queues, &nr_masks);
/* If even this fails we default to all CPUs use first request queue */
if (!masks) {
for_each_possible_cpu(cpu)
@@ -891,7 +888,7 @@ fallback:
}
for (q = 0; q < fs->num_request_queues; q++) {
- for_each_cpu(cpu, &masks[q])
+ for_each_cpu(cpu, &masks[q % nr_masks])
fs->mq_map[cpu] = q + VQ_REQUEST;
}
kfree(masks);
@@ -1008,7 +1005,7 @@ static void virtio_fs_cleanup_vqs(struct virtio_device *vdev)
*/
static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
long nr_pages, enum dax_access_mode mode,
- void **kaddr, pfn_t *pfn)
+ void **kaddr, unsigned long *pfn)
{
struct virtio_fs *fs = dax_get_private(dax_dev);
phys_addr_t offset = PFN_PHYS(pgoff);
@@ -1017,7 +1014,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
if (kaddr)
*kaddr = fs->window_kaddr + offset;
if (pfn)
- *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, 0);
+ *pfn = PHYS_PFN(fs->window_phys_addr + offset);
return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
}
@@ -1385,7 +1382,7 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
unsigned int out_sgs = 0;
unsigned int in_sgs = 0;
unsigned int total_sgs;
- unsigned int i;
+ unsigned int i, hash;
int ret;
bool notify;
struct fuse_pqueue *fpq;
@@ -1445,8 +1442,9 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
/* Request successfully sent. */
fpq = &fsvq->fud->pq;
+ hash = fuse_req_hash(req->in.h.unique);
spin_lock(&fpq->lock);
- list_add_tail(&req->list, fpq->processing);
+ list_add_tail(&req->list, &fpq->processing[hash]);
spin_unlock(&fpq->lock);
set_bit(FR_SENT, &req->flags);
/* matches barrier in request_wait_answer() */
@@ -1481,8 +1479,7 @@ static void virtio_fs_send_req(struct fuse_iqueue *fiq, struct fuse_req *req)
struct virtio_fs_vq *fsvq;
int ret;
- if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
- req->in.h.unique = fuse_get_unique(fiq);
+ fuse_request_assign_unique(fiq, req);
clear_bit(FR_PENDING, &req->flags);